xref: /csrg-svn/lib/libedit/tokenizer.c (revision 54240)
1*54240Sbostic /*-
2*54240Sbostic  * Copyright (c) 1992 The Regents of the University of California.
3*54240Sbostic  * All rights reserved.
4*54240Sbostic  *
5*54240Sbostic  * This code is derived from software contributed to Berkeley by
6*54240Sbostic  * Christos Zoulas of Cornell University.
7*54240Sbostic  *
8*54240Sbostic  * %sccs.include.redist.c%
9*54240Sbostic  */
10*54240Sbostic 
11*54240Sbostic #ifndef lint
12*54240Sbostic static char sccsid[] = "@(#)tokenizer.c	5.1 (Berkeley) 06/22/92";
13*54240Sbostic #endif /* not lint */
14*54240Sbostic 
15*54240Sbostic /*
16*54240Sbostic  * tokenizer.c: Bourne shell like tokenizer
17*54240Sbostic  */
18*54240Sbostic #include "sys.h"
19*54240Sbostic #include <string.h>
20*54240Sbostic #include <stdlib.h>
21*54240Sbostic #include "tokenizer.h"
22*54240Sbostic 
/* Quoting state of the scanner in tok_line(). */
typedef enum {
    Q_none,		/* Not inside any quoting		*/
    Q_single,		/* Inside '...'				*/
    Q_double,		/* Inside "..."				*/
    Q_one,		/* Backslash seen outside quotes	*/
    Q_doubleone		/* Backslash seen inside "..."		*/
} quote_t;

/* Separators used when tok_init() is given a NULL ifs */
#define IFS "\t \n"

/* Bits stored in the tokenizer's flags member */
#define TOK_KEEP	1	/* Keep the current word even if empty	*/
#define TOK_EAT		2	/* A backslash-newline was swallowed	*/

/* Initial size and growth step of the word buffer and of argv */
#define WINCR 20
#define AINCR 10

/* Allocator hooks; all tokenizer storage goes through these */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
36*54240Sbostic 
37*54240Sbostic 
38*54240Sbostic struct tokenizer {
39*54240Sbostic     char   *ifs;		/* In field separator			*/
40*54240Sbostic     int     argc, amax;		/* Current and maximum number of args	*/
41*54240Sbostic     char  **argv;		/* Argument list			*/
42*54240Sbostic     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
43*54240Sbostic     char   *wstart;		/* Beginning of next word		*/
44*54240Sbostic     char   *wspace;		/* Space of word buffer			*/
45*54240Sbostic     quote_t quote;		/* Quoting state			*/
46*54240Sbostic     int	    flags;		/* flags;				*/
47*54240Sbostic };
48*54240Sbostic 
49*54240Sbostic 
50*54240Sbostic private void tok_finish	__P((Tokenizer *));
51*54240Sbostic 
52*54240Sbostic 
53*54240Sbostic /* tok_finish():
54*54240Sbostic  *	Finish a word in the tokenizer.
55*54240Sbostic  */
56*54240Sbostic private void
57*54240Sbostic tok_finish(tok)
58*54240Sbostic     Tokenizer *tok;
59*54240Sbostic {
60*54240Sbostic     *tok->wptr = '\0';
61*54240Sbostic     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
62*54240Sbostic 	tok->argv[tok->argc++] = tok->wstart;
63*54240Sbostic 	tok->argv[tok->argc] = NULL;
64*54240Sbostic 	tok->wstart = ++tok->wptr;
65*54240Sbostic     }
66*54240Sbostic     tok->flags &= ~TOK_KEEP;
67*54240Sbostic }
68*54240Sbostic 
69*54240Sbostic 
70*54240Sbostic /* tok_init():
71*54240Sbostic  *	Initialize the tokenizer
72*54240Sbostic  */
73*54240Sbostic public Tokenizer *
74*54240Sbostic tok_init(ifs)
75*54240Sbostic     const char *ifs;
76*54240Sbostic {
77*54240Sbostic     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
78*54240Sbostic 
79*54240Sbostic     tok->ifs     = strdup(ifs ? ifs : IFS);
80*54240Sbostic     tok->argc    = 0;
81*54240Sbostic     tok->amax    = AINCR;
82*54240Sbostic     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
83*54240Sbostic     tok->argv[0] = NULL;
84*54240Sbostic     tok->wspace  = (char *) tok_malloc(WINCR);
85*54240Sbostic     tok->wmax    = tok->wspace + WINCR;
86*54240Sbostic     tok->wstart  = tok->wspace;
87*54240Sbostic     tok->wptr    = tok->wspace;
88*54240Sbostic     tok->flags   = 0;
89*54240Sbostic     tok->quote   = Q_none;
90*54240Sbostic 
91*54240Sbostic     return tok;
92*54240Sbostic }
93*54240Sbostic 
94*54240Sbostic 
95*54240Sbostic /* tok_reset():
96*54240Sbostic  *	Reset the tokenizer
97*54240Sbostic  */
98*54240Sbostic public void
99*54240Sbostic tok_reset(tok)
100*54240Sbostic     Tokenizer *tok;
101*54240Sbostic {
102*54240Sbostic     tok->argc  = 0;
103*54240Sbostic     tok->wstart = tok->wspace;
104*54240Sbostic     tok->wptr = tok->wspace;
105*54240Sbostic     tok->flags = 0;
106*54240Sbostic     tok->quote = Q_none;
107*54240Sbostic }
108*54240Sbostic 
109*54240Sbostic 
110*54240Sbostic /* tok_end():
111*54240Sbostic  *	Clean up
112*54240Sbostic  */
113*54240Sbostic public void
114*54240Sbostic tok_end(tok)
115*54240Sbostic     Tokenizer *tok;
116*54240Sbostic {
117*54240Sbostic     tok_free((ptr_t) tok->ifs);
118*54240Sbostic     tok_free((ptr_t) tok->wspace);
119*54240Sbostic     tok_free((ptr_t) tok->argv);
120*54240Sbostic     tok_free((ptr_t) tok);
121*54240Sbostic }
122*54240Sbostic 
123*54240Sbostic 
124*54240Sbostic 
125*54240Sbostic /* tok_line():
126*54240Sbostic  *	Bourne shell like tokenizing
127*54240Sbostic  *	Return:
128*54240Sbostic  *		-1: Internal error
129*54240Sbostic  *		 3: Quoted return
130*54240Sbostic  *		 2: Unmatched double quote
131*54240Sbostic  *		 1: Unmatched single quote
132*54240Sbostic  *		 0: Ok
133*54240Sbostic  */
134*54240Sbostic public int
135*54240Sbostic tok_line(tok, line, argc, argv)
136*54240Sbostic     Tokenizer *tok;
137*54240Sbostic     const char* line;
138*54240Sbostic     int *argc;
139*54240Sbostic     char ***argv;
140*54240Sbostic {
141*54240Sbostic     const char *ptr;
142*54240Sbostic 
143*54240Sbostic     while (1) {
144*54240Sbostic 	switch (*(ptr = line++)) {
145*54240Sbostic 	case '\'':
146*54240Sbostic 	    tok->flags |= TOK_KEEP;
147*54240Sbostic 	    tok->flags &= ~TOK_EAT;
148*54240Sbostic 	    switch (tok->quote) {
149*54240Sbostic 	    case Q_none:
150*54240Sbostic 		tok->quote = Q_single;	/* Enter single quote mode */
151*54240Sbostic 		break;
152*54240Sbostic 
153*54240Sbostic 	    case Q_single:		/* Exit single quote mode */
154*54240Sbostic 		tok->quote = Q_none;
155*54240Sbostic 		break;
156*54240Sbostic 
157*54240Sbostic 	    case Q_one:			/* Quote this ' */
158*54240Sbostic 		tok->quote = Q_none;
159*54240Sbostic 		*tok->wptr++ = *ptr;
160*54240Sbostic 		break;
161*54240Sbostic 
162*54240Sbostic 	    case Q_double:		/* Stay in double quote mode */
163*54240Sbostic 		*tok->wptr++ = *ptr;
164*54240Sbostic 		break;
165*54240Sbostic 
166*54240Sbostic 	    case Q_doubleone:		/* Quote this ' */
167*54240Sbostic 		tok->quote = Q_double;
168*54240Sbostic 		*tok->wptr++ = *ptr;
169*54240Sbostic 		break;
170*54240Sbostic 
171*54240Sbostic 	    default:
172*54240Sbostic 		return(-1);
173*54240Sbostic 	    }
174*54240Sbostic 	    break;
175*54240Sbostic 
176*54240Sbostic 	case '"':
177*54240Sbostic 	    tok->flags &= ~TOK_EAT;
178*54240Sbostic 	    tok->flags |= TOK_KEEP;
179*54240Sbostic 	    switch (tok->quote) {
180*54240Sbostic 	    case Q_none:		/* Enter double quote mode */
181*54240Sbostic 		tok->quote = Q_double;
182*54240Sbostic 		break;
183*54240Sbostic 
184*54240Sbostic 	    case Q_double:
185*54240Sbostic 		tok->quote = Q_none;	/* Exit double quote mode */
186*54240Sbostic 		break;
187*54240Sbostic 
188*54240Sbostic 	    case Q_one:			/* Quote this " */
189*54240Sbostic 		tok->quote = Q_none;
190*54240Sbostic 		*tok->wptr++ = *ptr;
191*54240Sbostic 		break;
192*54240Sbostic 
193*54240Sbostic 	    case Q_single:		/* Stay in single quote mode */
194*54240Sbostic 		*tok->wptr++ = *ptr;
195*54240Sbostic 		break;
196*54240Sbostic 
197*54240Sbostic 	    case Q_doubleone:		/* Quote this " */
198*54240Sbostic 		tok->quote = Q_double;
199*54240Sbostic 		*tok->wptr++ = *ptr;
200*54240Sbostic 		break;
201*54240Sbostic 
202*54240Sbostic 	    default:
203*54240Sbostic 		return(-1);
204*54240Sbostic 	    }
205*54240Sbostic 	    break;
206*54240Sbostic 
207*54240Sbostic 	case '\\':
208*54240Sbostic 	    tok->flags |= TOK_KEEP;
209*54240Sbostic 	    tok->flags &= ~TOK_EAT;
210*54240Sbostic 	    switch (tok->quote) {
211*54240Sbostic 	    case Q_none:		/* Quote next character */
212*54240Sbostic 		tok->quote = Q_one;
213*54240Sbostic 		break;
214*54240Sbostic 
215*54240Sbostic 	    case Q_double:
216*54240Sbostic 		tok->quote = Q_doubleone;/* Quote next character */
217*54240Sbostic 		break;
218*54240Sbostic 
219*54240Sbostic 	    case Q_one:
220*54240Sbostic 		*tok->wptr++ = *ptr;
221*54240Sbostic 		tok->quote = Q_none;	/* Quote this, restore state */
222*54240Sbostic 		break;
223*54240Sbostic 
224*54240Sbostic 	    case Q_single:		/* Stay in single quote mode */
225*54240Sbostic 		*tok->wptr++ = *ptr;
226*54240Sbostic 		break;
227*54240Sbostic 
228*54240Sbostic 	    case Q_doubleone:		/* Quote this \ */
229*54240Sbostic 		tok->quote = Q_double;
230*54240Sbostic 		*tok->wptr++ = *ptr;
231*54240Sbostic 		break;
232*54240Sbostic 
233*54240Sbostic 	    default:
234*54240Sbostic 		return(-1);
235*54240Sbostic 	    }
236*54240Sbostic 	    break;
237*54240Sbostic 
238*54240Sbostic 	case '\n':
239*54240Sbostic 	    tok->flags &= ~TOK_EAT;
240*54240Sbostic 	    switch (tok->quote) {
241*54240Sbostic 	    case Q_none:
242*54240Sbostic 		tok_finish(tok);
243*54240Sbostic 		*argv = tok->argv;
244*54240Sbostic 		*argc = tok->argc;
245*54240Sbostic 		return(0);
246*54240Sbostic 
247*54240Sbostic 	    case Q_single:
248*54240Sbostic 	    case Q_double:
249*54240Sbostic 		*tok->wptr++ = *ptr;	/* Add the return		*/
250*54240Sbostic 		break;
251*54240Sbostic 
252*54240Sbostic 	    case Q_doubleone:
253*54240Sbostic 		tok->flags |= TOK_EAT;
254*54240Sbostic 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
255*54240Sbostic 		break;
256*54240Sbostic 
257*54240Sbostic 	    case Q_one:
258*54240Sbostic 		tok->flags |= TOK_EAT;
259*54240Sbostic 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
260*54240Sbostic 		break;
261*54240Sbostic 
262*54240Sbostic 	    default:
263*54240Sbostic 		return(0);
264*54240Sbostic 	    }
265*54240Sbostic 	    break;
266*54240Sbostic 
267*54240Sbostic 	case '\0':
268*54240Sbostic 	    switch (tok->quote) {
269*54240Sbostic 	    case Q_none:
270*54240Sbostic 		/* Finish word and return */
271*54240Sbostic 		if (tok->flags & TOK_EAT) {
272*54240Sbostic 		    tok->flags &= ~TOK_EAT;
273*54240Sbostic 		    return 3;
274*54240Sbostic 		}
275*54240Sbostic 		tok_finish(tok);
276*54240Sbostic 		*argv = tok->argv;
277*54240Sbostic 		*argc = tok->argc;
278*54240Sbostic 		return(0);
279*54240Sbostic 
280*54240Sbostic 	    case Q_single:
281*54240Sbostic 		return(1);
282*54240Sbostic 
283*54240Sbostic 	    case Q_double:
284*54240Sbostic 		return(2);
285*54240Sbostic 
286*54240Sbostic 	    case Q_doubleone:
287*54240Sbostic 		tok->quote = Q_double;
288*54240Sbostic 		*tok->wptr++ = *ptr;
289*54240Sbostic 		break;
290*54240Sbostic 
291*54240Sbostic 	    case Q_one:
292*54240Sbostic 		tok->quote = Q_none;
293*54240Sbostic 		*tok->wptr++ = *ptr;
294*54240Sbostic 		break;
295*54240Sbostic 
296*54240Sbostic 	    default:
297*54240Sbostic 		return(-1);
298*54240Sbostic 	    }
299*54240Sbostic 	    break;
300*54240Sbostic 
301*54240Sbostic 	default:
302*54240Sbostic 	    tok->flags &= ~TOK_EAT;
303*54240Sbostic 	    switch (tok->quote) {
304*54240Sbostic 	    case Q_none:
305*54240Sbostic 		if (strchr(tok->ifs, *ptr) != NULL)
306*54240Sbostic 		    tok_finish(tok);
307*54240Sbostic 		else
308*54240Sbostic 		    *tok->wptr++ = *ptr;
309*54240Sbostic 		break;
310*54240Sbostic 
311*54240Sbostic 	    case Q_single:
312*54240Sbostic 	    case Q_double:
313*54240Sbostic 		*tok->wptr++ = *ptr;
314*54240Sbostic 		break;
315*54240Sbostic 
316*54240Sbostic 
317*54240Sbostic 	    case Q_doubleone:
318*54240Sbostic 		*tok->wptr++ = '\\';
319*54240Sbostic 		tok->quote = Q_double;
320*54240Sbostic 		*tok->wptr++ = *ptr;
321*54240Sbostic 		break;
322*54240Sbostic 
323*54240Sbostic 	    case Q_one:
324*54240Sbostic 		tok->quote = Q_none;
325*54240Sbostic 		*tok->wptr++ = *ptr;
326*54240Sbostic 		break;
327*54240Sbostic 
328*54240Sbostic 	    default:
329*54240Sbostic 		return(-1);
330*54240Sbostic 
331*54240Sbostic 	    }
332*54240Sbostic 	    break;
333*54240Sbostic 	}
334*54240Sbostic 
335*54240Sbostic 	if (tok->wptr >= tok->wmax - 4) {
336*54240Sbostic 	    size_t size = tok->wmax - tok->wspace + WINCR;
337*54240Sbostic 	    char *s = (char *) tok_realloc(tok->wspace, size);
338*54240Sbostic 	    /*SUPPRESS 22*/
339*54240Sbostic 	    int offs = s - tok->wspace;
340*54240Sbostic 
341*54240Sbostic 	    if (offs != 0) {
342*54240Sbostic 		int i;
343*54240Sbostic 		for (i = 0; i < tok->argc; i++)
344*54240Sbostic 		    tok->argv[i] = tok->argv[i] + offs;
345*54240Sbostic 		tok->wptr   = tok->wptr + offs;
346*54240Sbostic 		tok->wstart = tok->wstart + offs;
347*54240Sbostic 		tok->wmax   = s + size;
348*54240Sbostic 		tok->wspace = s;
349*54240Sbostic 	    }
350*54240Sbostic 	}
351*54240Sbostic 
352*54240Sbostic 	if (tok->argc >= tok->amax - 4) {
353*54240Sbostic 	    tok->amax += AINCR;
354*54240Sbostic 	    tok->argv = (char **) tok_realloc(tok->argv,
355*54240Sbostic 					      tok->amax * sizeof(char*));
356*54240Sbostic 	}
357*54240Sbostic 
358*54240Sbostic     }
359*54240Sbostic }
360