xref: /csrg-svn/lib/libedit/tokenizer.c (revision 61276)
154240Sbostic /*-
2*61276Sbostic  * Copyright (c) 1992, 1993
3*61276Sbostic  *	The Regents of the University of California.  All rights reserved.
454240Sbostic  *
554240Sbostic  * This code is derived from software contributed to Berkeley by
654240Sbostic  * Christos Zoulas of Cornell University.
754240Sbostic  *
854240Sbostic  * %sccs.include.redist.c%
954240Sbostic  */
1054240Sbostic 
1154624Schristos #if !defined(lint) && !defined(SCCSID)
12*61276Sbostic static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 06/04/93";
1354624Schristos #endif /* not lint && not SCCSID */
1454240Sbostic 
1554240Sbostic /*
1654240Sbostic  * tokenizer.c: Bourne shell like tokenizer
1754240Sbostic  */
1854240Sbostic #include "sys.h"
1954240Sbostic #include <string.h>
2054240Sbostic #include <stdlib.h>
2154240Sbostic #include "tokenizer.h"
2254240Sbostic 
/* Quoting state of the scanner in tok_line() */
typedef enum {
    Q_none,		/* Not inside any quoting			*/
    Q_single,		/* Inside single quotes				*/
    Q_double,		/* Inside double quotes				*/
    Q_one,		/* Just saw a backslash outside quotes		*/
    Q_doubleone		/* Just saw a backslash inside double quotes	*/
} quote_t;
2454240Sbostic 
/* Default word separators, used when tok_init() is passed a NULL ifs */
#define IFS "\t \n"

/* Bits kept in the tokenizer's flags member */
#define TOK_KEEP	1	/* Quote or backslash seen: emit the word even if empty */
#define TOK_EAT		2	/* A backslash-newline pair was just swallowed */

#define WINCR 20		/* Initial size and growth step of the word buffer (bytes) */
#define AINCR 10		/* Initial size and growth step of the argument list (slots) */

/* Allocator wrappers, mapped straight onto the C library */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
3654240Sbostic 
3754240Sbostic 
3854240Sbostic struct tokenizer {
3954240Sbostic     char   *ifs;		/* In field separator			*/
4054240Sbostic     int     argc, amax;		/* Current and maximum number of args	*/
4154240Sbostic     char  **argv;		/* Argument list			*/
4254240Sbostic     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
4354240Sbostic     char   *wstart;		/* Beginning of next word		*/
4454240Sbostic     char   *wspace;		/* Space of word buffer			*/
4554240Sbostic     quote_t quote;		/* Quoting state			*/
4654240Sbostic     int	    flags;		/* flags;				*/
4754240Sbostic };
4854240Sbostic 
4954240Sbostic 
5054240Sbostic private void tok_finish	__P((Tokenizer *));
5154240Sbostic 
5254240Sbostic 
5354240Sbostic /* tok_finish():
5454240Sbostic  *	Finish a word in the tokenizer.
5554240Sbostic  */
5654240Sbostic private void
tok_finish(tok)5754240Sbostic tok_finish(tok)
5854240Sbostic     Tokenizer *tok;
5954240Sbostic {
6054240Sbostic     *tok->wptr = '\0';
6154240Sbostic     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
6254240Sbostic 	tok->argv[tok->argc++] = tok->wstart;
6354240Sbostic 	tok->argv[tok->argc] = NULL;
6454240Sbostic 	tok->wstart = ++tok->wptr;
6554240Sbostic     }
6654240Sbostic     tok->flags &= ~TOK_KEEP;
6754240Sbostic }
6854240Sbostic 
6954240Sbostic 
7054240Sbostic /* tok_init():
7154240Sbostic  *	Initialize the tokenizer
7254240Sbostic  */
7354240Sbostic public Tokenizer *
tok_init(ifs)7454240Sbostic tok_init(ifs)
7554240Sbostic     const char *ifs;
7654240Sbostic {
7754240Sbostic     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
7854240Sbostic 
7954240Sbostic     tok->ifs     = strdup(ifs ? ifs : IFS);
8054240Sbostic     tok->argc    = 0;
8154240Sbostic     tok->amax    = AINCR;
8254240Sbostic     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
8354240Sbostic     tok->argv[0] = NULL;
8454240Sbostic     tok->wspace  = (char *) tok_malloc(WINCR);
8554240Sbostic     tok->wmax    = tok->wspace + WINCR;
8654240Sbostic     tok->wstart  = tok->wspace;
8754240Sbostic     tok->wptr    = tok->wspace;
8854240Sbostic     tok->flags   = 0;
8954240Sbostic     tok->quote   = Q_none;
9054240Sbostic 
9154240Sbostic     return tok;
9254240Sbostic }
9354240Sbostic 
9454240Sbostic 
9554240Sbostic /* tok_reset():
9654240Sbostic  *	Reset the tokenizer
9754240Sbostic  */
9854240Sbostic public void
tok_reset(tok)9954240Sbostic tok_reset(tok)
10054240Sbostic     Tokenizer *tok;
10154240Sbostic {
10254240Sbostic     tok->argc  = 0;
10354240Sbostic     tok->wstart = tok->wspace;
10454240Sbostic     tok->wptr = tok->wspace;
10554240Sbostic     tok->flags = 0;
10654240Sbostic     tok->quote = Q_none;
10754240Sbostic }
10854240Sbostic 
10954240Sbostic 
11054240Sbostic /* tok_end():
11154240Sbostic  *	Clean up
11254240Sbostic  */
11354240Sbostic public void
tok_end(tok)11454240Sbostic tok_end(tok)
11554240Sbostic     Tokenizer *tok;
11654240Sbostic {
11754240Sbostic     tok_free((ptr_t) tok->ifs);
11854240Sbostic     tok_free((ptr_t) tok->wspace);
11954240Sbostic     tok_free((ptr_t) tok->argv);
12054240Sbostic     tok_free((ptr_t) tok);
12154240Sbostic }
12254240Sbostic 
12354240Sbostic 
12454240Sbostic 
12554240Sbostic /* tok_line():
12654240Sbostic  *	Bourne shell like tokenizing
12754240Sbostic  *	Return:
12854240Sbostic  *		-1: Internal error
12954240Sbostic  *		 3: Quoted return
13054240Sbostic  *		 2: Unmatched double quote
13154240Sbostic  *		 1: Unmatched single quote
13254240Sbostic  *		 0: Ok
13354240Sbostic  */
13454240Sbostic public int
tok_line(tok,line,argc,argv)13554240Sbostic tok_line(tok, line, argc, argv)
13654240Sbostic     Tokenizer *tok;
13754240Sbostic     const char* line;
13854240Sbostic     int *argc;
13954240Sbostic     char ***argv;
14054240Sbostic {
14154240Sbostic     const char *ptr;
14254240Sbostic 
14354240Sbostic     while (1) {
14454240Sbostic 	switch (*(ptr = line++)) {
14554240Sbostic 	case '\'':
14654240Sbostic 	    tok->flags |= TOK_KEEP;
14754240Sbostic 	    tok->flags &= ~TOK_EAT;
14854240Sbostic 	    switch (tok->quote) {
14954240Sbostic 	    case Q_none:
15054240Sbostic 		tok->quote = Q_single;	/* Enter single quote mode */
15154240Sbostic 		break;
15254240Sbostic 
15354240Sbostic 	    case Q_single:		/* Exit single quote mode */
15454240Sbostic 		tok->quote = Q_none;
15554240Sbostic 		break;
15654240Sbostic 
15754240Sbostic 	    case Q_one:			/* Quote this ' */
15854240Sbostic 		tok->quote = Q_none;
15954240Sbostic 		*tok->wptr++ = *ptr;
16054240Sbostic 		break;
16154240Sbostic 
16254240Sbostic 	    case Q_double:		/* Stay in double quote mode */
16354240Sbostic 		*tok->wptr++ = *ptr;
16454240Sbostic 		break;
16554240Sbostic 
16654240Sbostic 	    case Q_doubleone:		/* Quote this ' */
16754240Sbostic 		tok->quote = Q_double;
16854240Sbostic 		*tok->wptr++ = *ptr;
16954240Sbostic 		break;
17054240Sbostic 
17154240Sbostic 	    default:
17254240Sbostic 		return(-1);
17354240Sbostic 	    }
17454240Sbostic 	    break;
17554240Sbostic 
17654240Sbostic 	case '"':
17754240Sbostic 	    tok->flags &= ~TOK_EAT;
17854240Sbostic 	    tok->flags |= TOK_KEEP;
17954240Sbostic 	    switch (tok->quote) {
18054240Sbostic 	    case Q_none:		/* Enter double quote mode */
18154240Sbostic 		tok->quote = Q_double;
18254240Sbostic 		break;
18354240Sbostic 
18454240Sbostic 	    case Q_double:
18554240Sbostic 		tok->quote = Q_none;	/* Exit double quote mode */
18654240Sbostic 		break;
18754240Sbostic 
18854240Sbostic 	    case Q_one:			/* Quote this " */
18954240Sbostic 		tok->quote = Q_none;
19054240Sbostic 		*tok->wptr++ = *ptr;
19154240Sbostic 		break;
19254240Sbostic 
19354240Sbostic 	    case Q_single:		/* Stay in single quote mode */
19454240Sbostic 		*tok->wptr++ = *ptr;
19554240Sbostic 		break;
19654240Sbostic 
19754240Sbostic 	    case Q_doubleone:		/* Quote this " */
19854240Sbostic 		tok->quote = Q_double;
19954240Sbostic 		*tok->wptr++ = *ptr;
20054240Sbostic 		break;
20154240Sbostic 
20254240Sbostic 	    default:
20354240Sbostic 		return(-1);
20454240Sbostic 	    }
20554240Sbostic 	    break;
20654240Sbostic 
20754240Sbostic 	case '\\':
20854240Sbostic 	    tok->flags |= TOK_KEEP;
20954240Sbostic 	    tok->flags &= ~TOK_EAT;
21054240Sbostic 	    switch (tok->quote) {
21154240Sbostic 	    case Q_none:		/* Quote next character */
21254240Sbostic 		tok->quote = Q_one;
21354240Sbostic 		break;
21454240Sbostic 
21554240Sbostic 	    case Q_double:
21654240Sbostic 		tok->quote = Q_doubleone;/* Quote next character */
21754240Sbostic 		break;
21854240Sbostic 
21954240Sbostic 	    case Q_one:
22054240Sbostic 		*tok->wptr++ = *ptr;
22154240Sbostic 		tok->quote = Q_none;	/* Quote this, restore state */
22254240Sbostic 		break;
22354240Sbostic 
22454240Sbostic 	    case Q_single:		/* Stay in single quote mode */
22554240Sbostic 		*tok->wptr++ = *ptr;
22654240Sbostic 		break;
22754240Sbostic 
22854240Sbostic 	    case Q_doubleone:		/* Quote this \ */
22954240Sbostic 		tok->quote = Q_double;
23054240Sbostic 		*tok->wptr++ = *ptr;
23154240Sbostic 		break;
23254240Sbostic 
23354240Sbostic 	    default:
23454240Sbostic 		return(-1);
23554240Sbostic 	    }
23654240Sbostic 	    break;
23754240Sbostic 
23854240Sbostic 	case '\n':
23954240Sbostic 	    tok->flags &= ~TOK_EAT;
24054240Sbostic 	    switch (tok->quote) {
24154240Sbostic 	    case Q_none:
24254240Sbostic 		tok_finish(tok);
24354240Sbostic 		*argv = tok->argv;
24454240Sbostic 		*argc = tok->argc;
24554240Sbostic 		return(0);
24654240Sbostic 
24754240Sbostic 	    case Q_single:
24854240Sbostic 	    case Q_double:
24954240Sbostic 		*tok->wptr++ = *ptr;	/* Add the return		*/
25054240Sbostic 		break;
25154240Sbostic 
25254240Sbostic 	    case Q_doubleone:
25354240Sbostic 		tok->flags |= TOK_EAT;
25454240Sbostic 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
25554240Sbostic 		break;
25654240Sbostic 
25754240Sbostic 	    case Q_one:
25854240Sbostic 		tok->flags |= TOK_EAT;
25954240Sbostic 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
26054240Sbostic 		break;
26154240Sbostic 
26254240Sbostic 	    default:
26354240Sbostic 		return(0);
26454240Sbostic 	    }
26554240Sbostic 	    break;
26654240Sbostic 
26754240Sbostic 	case '\0':
26854240Sbostic 	    switch (tok->quote) {
26954240Sbostic 	    case Q_none:
27054240Sbostic 		/* Finish word and return */
27154240Sbostic 		if (tok->flags & TOK_EAT) {
27254240Sbostic 		    tok->flags &= ~TOK_EAT;
27354240Sbostic 		    return 3;
27454240Sbostic 		}
27554240Sbostic 		tok_finish(tok);
27654240Sbostic 		*argv = tok->argv;
27754240Sbostic 		*argc = tok->argc;
27854240Sbostic 		return(0);
27954240Sbostic 
28054240Sbostic 	    case Q_single:
28154240Sbostic 		return(1);
28254240Sbostic 
28354240Sbostic 	    case Q_double:
28454240Sbostic 		return(2);
28554240Sbostic 
28654240Sbostic 	    case Q_doubleone:
28754240Sbostic 		tok->quote = Q_double;
28854240Sbostic 		*tok->wptr++ = *ptr;
28954240Sbostic 		break;
29054240Sbostic 
29154240Sbostic 	    case Q_one:
29254240Sbostic 		tok->quote = Q_none;
29354240Sbostic 		*tok->wptr++ = *ptr;
29454240Sbostic 		break;
29554240Sbostic 
29654240Sbostic 	    default:
29754240Sbostic 		return(-1);
29854240Sbostic 	    }
29954240Sbostic 	    break;
30054240Sbostic 
30154240Sbostic 	default:
30254240Sbostic 	    tok->flags &= ~TOK_EAT;
30354240Sbostic 	    switch (tok->quote) {
30454240Sbostic 	    case Q_none:
30554240Sbostic 		if (strchr(tok->ifs, *ptr) != NULL)
30654240Sbostic 		    tok_finish(tok);
30754240Sbostic 		else
30854240Sbostic 		    *tok->wptr++ = *ptr;
30954240Sbostic 		break;
31054240Sbostic 
31154240Sbostic 	    case Q_single:
31254240Sbostic 	    case Q_double:
31354240Sbostic 		*tok->wptr++ = *ptr;
31454240Sbostic 		break;
31554240Sbostic 
31654240Sbostic 
31754240Sbostic 	    case Q_doubleone:
31854240Sbostic 		*tok->wptr++ = '\\';
31954240Sbostic 		tok->quote = Q_double;
32054240Sbostic 		*tok->wptr++ = *ptr;
32154240Sbostic 		break;
32254240Sbostic 
32354240Sbostic 	    case Q_one:
32454240Sbostic 		tok->quote = Q_none;
32554240Sbostic 		*tok->wptr++ = *ptr;
32654240Sbostic 		break;
32754240Sbostic 
32854240Sbostic 	    default:
32954240Sbostic 		return(-1);
33054240Sbostic 
33154240Sbostic 	    }
33254240Sbostic 	    break;
33354240Sbostic 	}
33454240Sbostic 
33554240Sbostic 	if (tok->wptr >= tok->wmax - 4) {
33654240Sbostic 	    size_t size = tok->wmax - tok->wspace + WINCR;
33754240Sbostic 	    char *s = (char *) tok_realloc(tok->wspace, size);
33854240Sbostic 	    /*SUPPRESS 22*/
33954240Sbostic 	    int offs = s - tok->wspace;
34054240Sbostic 
34154240Sbostic 	    if (offs != 0) {
34254240Sbostic 		int i;
34354240Sbostic 		for (i = 0; i < tok->argc; i++)
34454240Sbostic 		    tok->argv[i] = tok->argv[i] + offs;
34554240Sbostic 		tok->wptr   = tok->wptr + offs;
34654240Sbostic 		tok->wstart = tok->wstart + offs;
34754240Sbostic 		tok->wmax   = s + size;
34854240Sbostic 		tok->wspace = s;
34954240Sbostic 	    }
35054240Sbostic 	}
35154240Sbostic 
35254240Sbostic 	if (tok->argc >= tok->amax - 4) {
35354240Sbostic 	    tok->amax += AINCR;
35454240Sbostic 	    tok->argv = (char **) tok_realloc(tok->argv,
35554240Sbostic 					      tok->amax * sizeof(char*));
35654240Sbostic 	}
35754240Sbostic 
35854240Sbostic     }
35954240Sbostic }
360