xref: /openbsd-src/lib/libedit/tokenizer.c (revision db3296cf5c1dd9058ceecc3a29fe4aaa0bd26000)
1 /*	$OpenBSD: tokenizer.c,v 1.7 2003/06/02 20:18:40 millert Exp $	*/
2 /*	$NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $	*/
3 
4 /*-
5  * Copyright (c) 1992, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Christos Zoulas of Cornell University.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if !defined(lint) && !defined(SCCSID)
37 #if 0
38 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
39 #else
40 static const char rcsid[] = "$OpenBSD: tokenizer.c,v 1.7 2003/06/02 20:18:40 millert Exp $";
41 #endif
42 #endif /* not lint && not SCCSID */
43 
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
47 #include "sys.h"
48 #include <string.h>
49 #include <stdlib.h>
50 #include "tokenizer.h"
51 
/*
 * Quoting state of the scanner in tok_line().
 */
typedef enum {
    Q_none,		/* Not inside any quoting			*/
    Q_single,		/* Inside '...'					*/
    Q_double,		/* Inside "..."					*/
    Q_one,		/* Backslash seen outside of quotes		*/
    Q_doubleone		/* Backslash seen inside "..."			*/
} quote_t;
53 
/* Word separators used when tok_init() is given a NULL ifs */
#define IFS "\t \n"

/* Bits stored in the flags member of struct tokenizer */
#define TOK_KEEP	0x01	/* Emit the current word even if it is empty */
#define TOK_EAT		0x02	/* A backslash-newline pair was just consumed */

/* Initial size and growth step of the word buffer (bytes) */
#define WINCR 20
/* Initial size and growth step of the argument vector (slots) */
#define AINCR 10

/* Allocator hooks; every allocation in this file goes through these */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
65 
66 
67 struct tokenizer {
68     char   *ifs;		/* In field separator			*/
69     int     argc, amax;		/* Current and maximum number of args	*/
70     char  **argv;		/* Argument list			*/
71     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
72     char   *wstart;		/* Beginning of next word		*/
73     char   *wspace;		/* Space of word buffer			*/
74     quote_t quote;		/* Quoting state			*/
75     int	    flags;		/* flags;				*/
76 };
77 
78 
79 private void tok_finish(Tokenizer *);
80 
81 
82 /* tok_finish():
83  *	Finish a word in the tokenizer.
84  */
85 private void
86 tok_finish(tok)
87     Tokenizer *tok;
88 {
89     *tok->wptr = '\0';
90     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
91 	tok->argv[tok->argc++] = tok->wstart;
92 	tok->argv[tok->argc] = NULL;
93 	tok->wstart = ++tok->wptr;
94     }
95     tok->flags &= ~TOK_KEEP;
96 }
97 
98 
99 /* tok_init():
100  *	Initialize the tokenizer
101  */
102 public Tokenizer *
103 tok_init(ifs)
104     const char *ifs;
105 {
106     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
107 
108     tok->ifs     = strdup(ifs ? ifs : IFS);
109     tok->argc    = 0;
110     tok->amax    = AINCR;
111     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
112     tok->argv[0] = NULL;
113     tok->wspace  = (char *) tok_malloc(WINCR);
114     tok->wmax    = tok->wspace + WINCR;
115     tok->wstart  = tok->wspace;
116     tok->wptr    = tok->wspace;
117     tok->flags   = 0;
118     tok->quote   = Q_none;
119 
120     return tok;
121 }
122 
123 
124 /* tok_reset():
125  *	Reset the tokenizer
126  */
127 public void
128 tok_reset(tok)
129     Tokenizer *tok;
130 {
131     tok->argc  = 0;
132     tok->wstart = tok->wspace;
133     tok->wptr = tok->wspace;
134     tok->flags = 0;
135     tok->quote = Q_none;
136 }
137 
138 
139 /* tok_end():
140  *	Clean up
141  */
142 public void
143 tok_end(tok)
144     Tokenizer *tok;
145 {
146     tok_free((ptr_t) tok->ifs);
147     tok_free((ptr_t) tok->wspace);
148     tok_free((ptr_t) tok->argv);
149     tok_free((ptr_t) tok);
150 }
151 
152 
153 
154 /* tok_line():
155  *	Bourne shell like tokenizing
156  *	Return:
157  *		-1: Internal error
158  *		 3: Quoted return
159  *		 2: Unmatched double quote
160  *		 1: Unmatched single quote
161  *		 0: Ok
162  */
163 public int
164 tok_line(tok, line, argc, argv)
165     Tokenizer *tok;
166     const char* line;
167     int *argc;
168     char ***argv;
169 {
170     const char *ptr;
171 
172     while (1) {
173 	switch (*(ptr = line++)) {
174 	case '\'':
175 	    tok->flags |= TOK_KEEP;
176 	    tok->flags &= ~TOK_EAT;
177 	    switch (tok->quote) {
178 	    case Q_none:
179 		tok->quote = Q_single;	/* Enter single quote mode */
180 		break;
181 
182 	    case Q_single:		/* Exit single quote mode */
183 		tok->quote = Q_none;
184 		break;
185 
186 	    case Q_one:			/* Quote this ' */
187 		tok->quote = Q_none;
188 		*tok->wptr++ = *ptr;
189 		break;
190 
191 	    case Q_double:		/* Stay in double quote mode */
192 		*tok->wptr++ = *ptr;
193 		break;
194 
195 	    case Q_doubleone:		/* Quote this ' */
196 		tok->quote = Q_double;
197 		*tok->wptr++ = *ptr;
198 		break;
199 
200 	    default:
201 		return(-1);
202 	    }
203 	    break;
204 
205 	case '"':
206 	    tok->flags &= ~TOK_EAT;
207 	    tok->flags |= TOK_KEEP;
208 	    switch (tok->quote) {
209 	    case Q_none:		/* Enter double quote mode */
210 		tok->quote = Q_double;
211 		break;
212 
213 	    case Q_double:
214 		tok->quote = Q_none;	/* Exit double quote mode */
215 		break;
216 
217 	    case Q_one:			/* Quote this " */
218 		tok->quote = Q_none;
219 		*tok->wptr++ = *ptr;
220 		break;
221 
222 	    case Q_single:		/* Stay in single quote mode */
223 		*tok->wptr++ = *ptr;
224 		break;
225 
226 	    case Q_doubleone:		/* Quote this " */
227 		tok->quote = Q_double;
228 		*tok->wptr++ = *ptr;
229 		break;
230 
231 	    default:
232 		return(-1);
233 	    }
234 	    break;
235 
236 	case '\\':
237 	    tok->flags |= TOK_KEEP;
238 	    tok->flags &= ~TOK_EAT;
239 	    switch (tok->quote) {
240 	    case Q_none:		/* Quote next character */
241 		tok->quote = Q_one;
242 		break;
243 
244 	    case Q_double:
245 		tok->quote = Q_doubleone;/* Quote next character */
246 		break;
247 
248 	    case Q_one:
249 		*tok->wptr++ = *ptr;
250 		tok->quote = Q_none;	/* Quote this, restore state */
251 		break;
252 
253 	    case Q_single:		/* Stay in single quote mode */
254 		*tok->wptr++ = *ptr;
255 		break;
256 
257 	    case Q_doubleone:		/* Quote this \ */
258 		tok->quote = Q_double;
259 		*tok->wptr++ = *ptr;
260 		break;
261 
262 	    default:
263 		return(-1);
264 	    }
265 	    break;
266 
267 	case '\n':
268 	    tok->flags &= ~TOK_EAT;
269 	    switch (tok->quote) {
270 	    case Q_none:
271 		tok_finish(tok);
272 		*argv = tok->argv;
273 		*argc = tok->argc;
274 		return(0);
275 
276 	    case Q_single:
277 	    case Q_double:
278 		*tok->wptr++ = *ptr;	/* Add the return		*/
279 		break;
280 
281 	    case Q_doubleone:
282 		tok->flags |= TOK_EAT;
283 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
284 		break;
285 
286 	    case Q_one:
287 		tok->flags |= TOK_EAT;
288 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
289 		break;
290 
291 	    default:
292 		return(0);
293 	    }
294 	    break;
295 
296 	case '\0':
297 	    switch (tok->quote) {
298 	    case Q_none:
299 		/* Finish word and return */
300 		if (tok->flags & TOK_EAT) {
301 		    tok->flags &= ~TOK_EAT;
302 		    return 3;
303 		}
304 		tok_finish(tok);
305 		*argv = tok->argv;
306 		*argc = tok->argc;
307 		return(0);
308 
309 	    case Q_single:
310 		return(1);
311 
312 	    case Q_double:
313 		return(2);
314 
315 	    case Q_doubleone:
316 		tok->quote = Q_double;
317 		*tok->wptr++ = *ptr;
318 		break;
319 
320 	    case Q_one:
321 		tok->quote = Q_none;
322 		*tok->wptr++ = *ptr;
323 		break;
324 
325 	    default:
326 		return(-1);
327 	    }
328 	    break;
329 
330 	default:
331 	    tok->flags &= ~TOK_EAT;
332 	    switch (tok->quote) {
333 	    case Q_none:
334 		if (strchr(tok->ifs, *ptr) != NULL)
335 		    tok_finish(tok);
336 		else
337 		    *tok->wptr++ = *ptr;
338 		break;
339 
340 	    case Q_single:
341 	    case Q_double:
342 		*tok->wptr++ = *ptr;
343 		break;
344 
345 
346 	    case Q_doubleone:
347 		*tok->wptr++ = '\\';
348 		tok->quote = Q_double;
349 		*tok->wptr++ = *ptr;
350 		break;
351 
352 	    case Q_one:
353 		tok->quote = Q_none;
354 		*tok->wptr++ = *ptr;
355 		break;
356 
357 	    default:
358 		return(-1);
359 
360 	    }
361 	    break;
362 	}
363 
364 	if (tok->wptr >= tok->wmax - 4) {
365 	    size_t size = tok->wmax - tok->wspace + WINCR;
366 	    char *s = (char *) tok_realloc(tok->wspace, size);
367 	    /*SUPPRESS 22*/
368 	    int offs;
369 
370 	    if (s != NULL && (offs = s - tok->wspace) != 0) {
371 		int i;
372 		for (i = 0; i < tok->argc; i++)
373 		    tok->argv[i] = tok->argv[i] + offs;
374 		tok->wptr   = tok->wptr + offs;
375 		tok->wstart = tok->wstart + offs;
376 		tok->wmax   = s + size;
377 		tok->wspace = s;
378 	    }
379 	}
380 
381 	if (tok->argc >= tok->amax - 4) {
382 	    char **nargv = (char **) tok_realloc(tok->argv, (tok->amax + AINCR)
383 						 * sizeof(char*));
384 	    if (nargv != NULL) {
385 		tok->amax += AINCR;
386 		tok->argv = nargv;
387 	    }
388 	}
389     }
390 }
391