xref: /netbsd-src/lib/libedit/tokenizer.c (revision 2a399c6883d870daece976daec6ffa7bb7f934ce)
1 /*	$NetBSD: tokenizer.c,v 1.3 1997/07/06 18:25:37 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Christos Zoulas of Cornell University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #if !defined(lint) && !defined(SCCSID)
41 #if 0
42 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
43 #else
44 __RCSID("$NetBSD: tokenizer.c,v 1.3 1997/07/06 18:25:37 christos Exp $");
45 #endif
46 #endif /* not lint && not SCCSID */
47 
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
51 #include "sys.h"
52 #include <string.h>
53 #include <stdlib.h>
54 #include "tokenizer.h"
55 
/*
 * Quoting state of the scanner in tok_line().
 */
typedef enum {
    Q_none,		/* Not inside any quoting		*/
    Q_single,		/* Inside single quotes			*/
    Q_double,		/* Inside double quotes			*/
    Q_one,		/* Backslash seen while unquoted	*/
    Q_doubleone		/* Backslash seen inside double quotes	*/
} quote_t;
57 
/* Separators used when tok_init() is handed a NULL ifs */
#define IFS		"\t \n"

/* Bits stored in the tokenizer's flags member */
#define TOK_KEEP	0x01	/* Emit the current word even if it is empty */
#define TOK_EAT		0x02	/* A backslash-newline pair was just swallowed */

/* Initial size and growth step of the two dynamic buffers */
#define WINCR		20	/* Word buffer, in bytes	*/
#define AINCR		10	/* Argument vector, in entries	*/

/* Allocator hooks */
#define tok_malloc(a)		malloc(a)
#define tok_realloc(a, b)	realloc(a, b)
#define tok_free(a)		free(a)
69 
70 
71 struct tokenizer {
72     char   *ifs;		/* In field separator			*/
73     int     argc, amax;		/* Current and maximum number of args	*/
74     char  **argv;		/* Argument list			*/
75     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
76     char   *wstart;		/* Beginning of next word		*/
77     char   *wspace;		/* Space of word buffer			*/
78     quote_t quote;		/* Quoting state			*/
79     int	    flags;		/* flags;				*/
80 };
81 
82 
83 private void tok_finish	__P((Tokenizer *));
84 
85 
86 /* tok_finish():
87  *	Finish a word in the tokenizer.
88  */
89 private void
90 tok_finish(tok)
91     Tokenizer *tok;
92 {
93     *tok->wptr = '\0';
94     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
95 	tok->argv[tok->argc++] = tok->wstart;
96 	tok->argv[tok->argc] = NULL;
97 	tok->wstart = ++tok->wptr;
98     }
99     tok->flags &= ~TOK_KEEP;
100 }
101 
102 
103 /* tok_init():
104  *	Initialize the tokenizer
105  */
106 public Tokenizer *
107 tok_init(ifs)
108     const char *ifs;
109 {
110     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
111 
112     tok->ifs     = strdup(ifs ? ifs : IFS);
113     tok->argc    = 0;
114     tok->amax    = AINCR;
115     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
116     tok->argv[0] = NULL;
117     tok->wspace  = (char *) tok_malloc(WINCR);
118     tok->wmax    = tok->wspace + WINCR;
119     tok->wstart  = tok->wspace;
120     tok->wptr    = tok->wspace;
121     tok->flags   = 0;
122     tok->quote   = Q_none;
123 
124     return tok;
125 }
126 
127 
128 /* tok_reset():
129  *	Reset the tokenizer
130  */
131 public void
132 tok_reset(tok)
133     Tokenizer *tok;
134 {
135     tok->argc  = 0;
136     tok->wstart = tok->wspace;
137     tok->wptr = tok->wspace;
138     tok->flags = 0;
139     tok->quote = Q_none;
140 }
141 
142 
143 /* tok_end():
144  *	Clean up
145  */
146 public void
147 tok_end(tok)
148     Tokenizer *tok;
149 {
150     tok_free((ptr_t) tok->ifs);
151     tok_free((ptr_t) tok->wspace);
152     tok_free((ptr_t) tok->argv);
153     tok_free((ptr_t) tok);
154 }
155 
156 
157 
158 /* tok_line():
159  *	Bourne shell like tokenizing
160  *	Return:
161  *		-1: Internal error
162  *		 3: Quoted return
163  *		 2: Unmatched double quote
164  *		 1: Unmatched single quote
165  *		 0: Ok
166  */
167 public int
168 tok_line(tok, line, argc, argv)
169     Tokenizer *tok;
170     const char* line;
171     int *argc;
172     char ***argv;
173 {
174     const char *ptr;
175 
176     while (1) {
177 	switch (*(ptr = line++)) {
178 	case '\'':
179 	    tok->flags |= TOK_KEEP;
180 	    tok->flags &= ~TOK_EAT;
181 	    switch (tok->quote) {
182 	    case Q_none:
183 		tok->quote = Q_single;	/* Enter single quote mode */
184 		break;
185 
186 	    case Q_single:		/* Exit single quote mode */
187 		tok->quote = Q_none;
188 		break;
189 
190 	    case Q_one:			/* Quote this ' */
191 		tok->quote = Q_none;
192 		*tok->wptr++ = *ptr;
193 		break;
194 
195 	    case Q_double:		/* Stay in double quote mode */
196 		*tok->wptr++ = *ptr;
197 		break;
198 
199 	    case Q_doubleone:		/* Quote this ' */
200 		tok->quote = Q_double;
201 		*tok->wptr++ = *ptr;
202 		break;
203 
204 	    default:
205 		return(-1);
206 	    }
207 	    break;
208 
209 	case '"':
210 	    tok->flags &= ~TOK_EAT;
211 	    tok->flags |= TOK_KEEP;
212 	    switch (tok->quote) {
213 	    case Q_none:		/* Enter double quote mode */
214 		tok->quote = Q_double;
215 		break;
216 
217 	    case Q_double:
218 		tok->quote = Q_none;	/* Exit double quote mode */
219 		break;
220 
221 	    case Q_one:			/* Quote this " */
222 		tok->quote = Q_none;
223 		*tok->wptr++ = *ptr;
224 		break;
225 
226 	    case Q_single:		/* Stay in single quote mode */
227 		*tok->wptr++ = *ptr;
228 		break;
229 
230 	    case Q_doubleone:		/* Quote this " */
231 		tok->quote = Q_double;
232 		*tok->wptr++ = *ptr;
233 		break;
234 
235 	    default:
236 		return(-1);
237 	    }
238 	    break;
239 
240 	case '\\':
241 	    tok->flags |= TOK_KEEP;
242 	    tok->flags &= ~TOK_EAT;
243 	    switch (tok->quote) {
244 	    case Q_none:		/* Quote next character */
245 		tok->quote = Q_one;
246 		break;
247 
248 	    case Q_double:
249 		tok->quote = Q_doubleone;/* Quote next character */
250 		break;
251 
252 	    case Q_one:
253 		*tok->wptr++ = *ptr;
254 		tok->quote = Q_none;	/* Quote this, restore state */
255 		break;
256 
257 	    case Q_single:		/* Stay in single quote mode */
258 		*tok->wptr++ = *ptr;
259 		break;
260 
261 	    case Q_doubleone:		/* Quote this \ */
262 		tok->quote = Q_double;
263 		*tok->wptr++ = *ptr;
264 		break;
265 
266 	    default:
267 		return(-1);
268 	    }
269 	    break;
270 
271 	case '\n':
272 	    tok->flags &= ~TOK_EAT;
273 	    switch (tok->quote) {
274 	    case Q_none:
275 		tok_finish(tok);
276 		*argv = tok->argv;
277 		*argc = tok->argc;
278 		return(0);
279 
280 	    case Q_single:
281 	    case Q_double:
282 		*tok->wptr++ = *ptr;	/* Add the return		*/
283 		break;
284 
285 	    case Q_doubleone:
286 		tok->flags |= TOK_EAT;
287 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
288 		break;
289 
290 	    case Q_one:
291 		tok->flags |= TOK_EAT;
292 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
293 		break;
294 
295 	    default:
296 		return(0);
297 	    }
298 	    break;
299 
300 	case '\0':
301 	    switch (tok->quote) {
302 	    case Q_none:
303 		/* Finish word and return */
304 		if (tok->flags & TOK_EAT) {
305 		    tok->flags &= ~TOK_EAT;
306 		    return 3;
307 		}
308 		tok_finish(tok);
309 		*argv = tok->argv;
310 		*argc = tok->argc;
311 		return(0);
312 
313 	    case Q_single:
314 		return(1);
315 
316 	    case Q_double:
317 		return(2);
318 
319 	    case Q_doubleone:
320 		tok->quote = Q_double;
321 		*tok->wptr++ = *ptr;
322 		break;
323 
324 	    case Q_one:
325 		tok->quote = Q_none;
326 		*tok->wptr++ = *ptr;
327 		break;
328 
329 	    default:
330 		return(-1);
331 	    }
332 	    break;
333 
334 	default:
335 	    tok->flags &= ~TOK_EAT;
336 	    switch (tok->quote) {
337 	    case Q_none:
338 		if (strchr(tok->ifs, *ptr) != NULL)
339 		    tok_finish(tok);
340 		else
341 		    *tok->wptr++ = *ptr;
342 		break;
343 
344 	    case Q_single:
345 	    case Q_double:
346 		*tok->wptr++ = *ptr;
347 		break;
348 
349 
350 	    case Q_doubleone:
351 		*tok->wptr++ = '\\';
352 		tok->quote = Q_double;
353 		*tok->wptr++ = *ptr;
354 		break;
355 
356 	    case Q_one:
357 		tok->quote = Q_none;
358 		*tok->wptr++ = *ptr;
359 		break;
360 
361 	    default:
362 		return(-1);
363 
364 	    }
365 	    break;
366 	}
367 
368 	if (tok->wptr >= tok->wmax - 4) {
369 	    size_t size = tok->wmax - tok->wspace + WINCR;
370 	    char *s = (char *) tok_realloc(tok->wspace, size);
371 	    /*SUPPRESS 22*/
372 	    int offs = s - tok->wspace;
373 
374 	    if (offs != 0) {
375 		int i;
376 		for (i = 0; i < tok->argc; i++)
377 		    tok->argv[i] = tok->argv[i] + offs;
378 		tok->wptr   = tok->wptr + offs;
379 		tok->wstart = tok->wstart + offs;
380 		tok->wmax   = s + size;
381 		tok->wspace = s;
382 	    }
383 	}
384 
385 	if (tok->argc >= tok->amax - 4) {
386 	    tok->amax += AINCR;
387 	    tok->argv = (char **) tok_realloc(tok->argv,
388 					      tok->amax * sizeof(char*));
389 	}
390 
391     }
392 }
393