xref: /netbsd-src/lib/libedit/tokenizer.c (revision d0fed6c87ddc40a8bffa6f99e7433ddfc864dd83)
1 /*	$NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Christos Zoulas of Cornell University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #if !defined(lint) && !defined(SCCSID)
40 #if 0
41 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
42 #else
43 static char rcsid[] = "$NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $";
44 #endif
45 #endif /* not lint && not SCCSID */
46 
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
50 #include "sys.h"
51 #include <string.h>
52 #include <stdlib.h>
53 #include "tokenizer.h"
54 
/*
 * Quoting state tracked by tok_line() while it scans the input.
 */
typedef enum {
    Q_none,             /* Not inside any quote                         */
    Q_single,           /* Between single quotes                        */
    Q_double,           /* Between double quotes                        */
    Q_one,              /* Just saw a backslash outside of quotes       */
    Q_doubleone         /* Just saw a backslash between double quotes   */
} quote_t;

/* Field separators used when tok_init() is called with a NULL ifs */
#define IFS "\t \n"

/* Flag bits: TOK_KEEP makes tok_finish() emit the word even when empty */
#define TOK_KEEP	1
/* Flag bits: TOK_EAT records that an escaped newline was just swallowed */
#define TOK_EAT		2

/* Initial size and growth step of the word buffer, in bytes */
#define WINCR 20
/* Initial size and growth step of the argument vector, in entries */
#define AINCR 10

/* Allocator hooks; they map straight onto the C library allocator */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
68 
69 
70 struct tokenizer {
71     char   *ifs;		/* In field separator			*/
72     int     argc, amax;		/* Current and maximum number of args	*/
73     char  **argv;		/* Argument list			*/
74     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
75     char   *wstart;		/* Beginning of next word		*/
76     char   *wspace;		/* Space of word buffer			*/
77     quote_t quote;		/* Quoting state			*/
78     int	    flags;		/* flags;				*/
79 };
80 
81 
82 private void tok_finish	__P((Tokenizer *));
83 
84 
85 /* tok_finish():
86  *	Finish a word in the tokenizer.
87  */
88 private void
89 tok_finish(tok)
90     Tokenizer *tok;
91 {
92     *tok->wptr = '\0';
93     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
94 	tok->argv[tok->argc++] = tok->wstart;
95 	tok->argv[tok->argc] = NULL;
96 	tok->wstart = ++tok->wptr;
97     }
98     tok->flags &= ~TOK_KEEP;
99 }
100 
101 
102 /* tok_init():
103  *	Initialize the tokenizer
104  */
105 public Tokenizer *
106 tok_init(ifs)
107     const char *ifs;
108 {
109     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
110 
111     tok->ifs     = strdup(ifs ? ifs : IFS);
112     tok->argc    = 0;
113     tok->amax    = AINCR;
114     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
115     tok->argv[0] = NULL;
116     tok->wspace  = (char *) tok_malloc(WINCR);
117     tok->wmax    = tok->wspace + WINCR;
118     tok->wstart  = tok->wspace;
119     tok->wptr    = tok->wspace;
120     tok->flags   = 0;
121     tok->quote   = Q_none;
122 
123     return tok;
124 }
125 
126 
127 /* tok_reset():
128  *	Reset the tokenizer
129  */
130 public void
131 tok_reset(tok)
132     Tokenizer *tok;
133 {
134     tok->argc  = 0;
135     tok->wstart = tok->wspace;
136     tok->wptr = tok->wspace;
137     tok->flags = 0;
138     tok->quote = Q_none;
139 }
140 
141 
142 /* tok_end():
143  *	Clean up
144  */
145 public void
146 tok_end(tok)
147     Tokenizer *tok;
148 {
149     tok_free((ptr_t) tok->ifs);
150     tok_free((ptr_t) tok->wspace);
151     tok_free((ptr_t) tok->argv);
152     tok_free((ptr_t) tok);
153 }
154 
155 
156 
157 /* tok_line():
158  *	Bourne shell like tokenizing
159  *	Return:
160  *		-1: Internal error
161  *		 3: Quoted return
162  *		 2: Unmatched double quote
163  *		 1: Unmatched single quote
164  *		 0: Ok
165  */
166 public int
167 tok_line(tok, line, argc, argv)
168     Tokenizer *tok;
169     const char* line;
170     int *argc;
171     char ***argv;
172 {
173     const char *ptr;
174 
175     while (1) {
176 	switch (*(ptr = line++)) {
177 	case '\'':
178 	    tok->flags |= TOK_KEEP;
179 	    tok->flags &= ~TOK_EAT;
180 	    switch (tok->quote) {
181 	    case Q_none:
182 		tok->quote = Q_single;	/* Enter single quote mode */
183 		break;
184 
185 	    case Q_single:		/* Exit single quote mode */
186 		tok->quote = Q_none;
187 		break;
188 
189 	    case Q_one:			/* Quote this ' */
190 		tok->quote = Q_none;
191 		*tok->wptr++ = *ptr;
192 		break;
193 
194 	    case Q_double:		/* Stay in double quote mode */
195 		*tok->wptr++ = *ptr;
196 		break;
197 
198 	    case Q_doubleone:		/* Quote this ' */
199 		tok->quote = Q_double;
200 		*tok->wptr++ = *ptr;
201 		break;
202 
203 	    default:
204 		return(-1);
205 	    }
206 	    break;
207 
208 	case '"':
209 	    tok->flags &= ~TOK_EAT;
210 	    tok->flags |= TOK_KEEP;
211 	    switch (tok->quote) {
212 	    case Q_none:		/* Enter double quote mode */
213 		tok->quote = Q_double;
214 		break;
215 
216 	    case Q_double:
217 		tok->quote = Q_none;	/* Exit double quote mode */
218 		break;
219 
220 	    case Q_one:			/* Quote this " */
221 		tok->quote = Q_none;
222 		*tok->wptr++ = *ptr;
223 		break;
224 
225 	    case Q_single:		/* Stay in single quote mode */
226 		*tok->wptr++ = *ptr;
227 		break;
228 
229 	    case Q_doubleone:		/* Quote this " */
230 		tok->quote = Q_double;
231 		*tok->wptr++ = *ptr;
232 		break;
233 
234 	    default:
235 		return(-1);
236 	    }
237 	    break;
238 
239 	case '\\':
240 	    tok->flags |= TOK_KEEP;
241 	    tok->flags &= ~TOK_EAT;
242 	    switch (tok->quote) {
243 	    case Q_none:		/* Quote next character */
244 		tok->quote = Q_one;
245 		break;
246 
247 	    case Q_double:
248 		tok->quote = Q_doubleone;/* Quote next character */
249 		break;
250 
251 	    case Q_one:
252 		*tok->wptr++ = *ptr;
253 		tok->quote = Q_none;	/* Quote this, restore state */
254 		break;
255 
256 	    case Q_single:		/* Stay in single quote mode */
257 		*tok->wptr++ = *ptr;
258 		break;
259 
260 	    case Q_doubleone:		/* Quote this \ */
261 		tok->quote = Q_double;
262 		*tok->wptr++ = *ptr;
263 		break;
264 
265 	    default:
266 		return(-1);
267 	    }
268 	    break;
269 
270 	case '\n':
271 	    tok->flags &= ~TOK_EAT;
272 	    switch (tok->quote) {
273 	    case Q_none:
274 		tok_finish(tok);
275 		*argv = tok->argv;
276 		*argc = tok->argc;
277 		return(0);
278 
279 	    case Q_single:
280 	    case Q_double:
281 		*tok->wptr++ = *ptr;	/* Add the return		*/
282 		break;
283 
284 	    case Q_doubleone:
285 		tok->flags |= TOK_EAT;
286 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
287 		break;
288 
289 	    case Q_one:
290 		tok->flags |= TOK_EAT;
291 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
292 		break;
293 
294 	    default:
295 		return(0);
296 	    }
297 	    break;
298 
299 	case '\0':
300 	    switch (tok->quote) {
301 	    case Q_none:
302 		/* Finish word and return */
303 		if (tok->flags & TOK_EAT) {
304 		    tok->flags &= ~TOK_EAT;
305 		    return 3;
306 		}
307 		tok_finish(tok);
308 		*argv = tok->argv;
309 		*argc = tok->argc;
310 		return(0);
311 
312 	    case Q_single:
313 		return(1);
314 
315 	    case Q_double:
316 		return(2);
317 
318 	    case Q_doubleone:
319 		tok->quote = Q_double;
320 		*tok->wptr++ = *ptr;
321 		break;
322 
323 	    case Q_one:
324 		tok->quote = Q_none;
325 		*tok->wptr++ = *ptr;
326 		break;
327 
328 	    default:
329 		return(-1);
330 	    }
331 	    break;
332 
333 	default:
334 	    tok->flags &= ~TOK_EAT;
335 	    switch (tok->quote) {
336 	    case Q_none:
337 		if (strchr(tok->ifs, *ptr) != NULL)
338 		    tok_finish(tok);
339 		else
340 		    *tok->wptr++ = *ptr;
341 		break;
342 
343 	    case Q_single:
344 	    case Q_double:
345 		*tok->wptr++ = *ptr;
346 		break;
347 
348 
349 	    case Q_doubleone:
350 		*tok->wptr++ = '\\';
351 		tok->quote = Q_double;
352 		*tok->wptr++ = *ptr;
353 		break;
354 
355 	    case Q_one:
356 		tok->quote = Q_none;
357 		*tok->wptr++ = *ptr;
358 		break;
359 
360 	    default:
361 		return(-1);
362 
363 	    }
364 	    break;
365 	}
366 
367 	if (tok->wptr >= tok->wmax - 4) {
368 	    size_t size = tok->wmax - tok->wspace + WINCR;
369 	    char *s = (char *) tok_realloc(tok->wspace, size);
370 	    /*SUPPRESS 22*/
371 	    int offs = s - tok->wspace;
372 
373 	    if (offs != 0) {
374 		int i;
375 		for (i = 0; i < tok->argc; i++)
376 		    tok->argv[i] = tok->argv[i] + offs;
377 		tok->wptr   = tok->wptr + offs;
378 		tok->wstart = tok->wstart + offs;
379 		tok->wmax   = s + size;
380 		tok->wspace = s;
381 	    }
382 	}
383 
384 	if (tok->argc >= tok->amax - 4) {
385 	    tok->amax += AINCR;
386 	    tok->argv = (char **) tok_realloc(tok->argv,
387 					      tok->amax * sizeof(char*));
388 	}
389 
390     }
391 }
392