xref: /netbsd-src/lib/libedit/tokenizer.c (revision d9158b13b5dfe46201430699a3f7a235ecf28df3)
1 /*-
2  * Copyright (c) 1992, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Christos Zoulas of Cornell University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
#if !(defined(lint) || defined(SCCSID))
/* SCCS-style version identification string for this file. */
static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
#endif /* !(lint || SCCSID) */
40 
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
44 #include "sys.h"
45 #include <string.h>
46 #include <stdlib.h>
47 #include "tokenizer.h"
48 
/*
 * Quoting state of the scanner.
 */
typedef enum {
    Q_none,			/* Not inside any quote			*/
    Q_single,			/* Inside '...'				*/
    Q_double,			/* Inside "..."				*/
    Q_one,			/* Backslash seen outside quotes	*/
    Q_doubleone			/* Backslash seen inside "..."		*/
} quote_t;

/* Separator set used when tok_init() is given a NULL ifs. */
#define IFS "\t \n"

/* Bits kept in the tokenizer's flags member. */
#define TOK_KEEP	0x01	/* Emit the current word even if empty	*/
#define TOK_EAT		0x02	/* An escaped newline was just dropped	*/

/* Growth steps: bytes of word buffer, slots of argument vector. */
#define WINCR 20
#define AINCR 10

/* Allocator hooks; every allocation in this file goes through them. */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
62 
63 
64 struct tokenizer {
65     char   *ifs;		/* In field separator			*/
66     int     argc, amax;		/* Current and maximum number of args	*/
67     char  **argv;		/* Argument list			*/
68     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
69     char   *wstart;		/* Beginning of next word		*/
70     char   *wspace;		/* Space of word buffer			*/
71     quote_t quote;		/* Quoting state			*/
72     int	    flags;		/* flags;				*/
73 };
74 
75 
76 private void tok_finish	__P((Tokenizer *));
77 
78 
79 /* tok_finish():
80  *	Finish a word in the tokenizer.
81  */
82 private void
83 tok_finish(tok)
84     Tokenizer *tok;
85 {
86     *tok->wptr = '\0';
87     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
88 	tok->argv[tok->argc++] = tok->wstart;
89 	tok->argv[tok->argc] = NULL;
90 	tok->wstart = ++tok->wptr;
91     }
92     tok->flags &= ~TOK_KEEP;
93 }
94 
95 
96 /* tok_init():
97  *	Initialize the tokenizer
98  */
99 public Tokenizer *
100 tok_init(ifs)
101     const char *ifs;
102 {
103     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
104 
105     tok->ifs     = strdup(ifs ? ifs : IFS);
106     tok->argc    = 0;
107     tok->amax    = AINCR;
108     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
109     tok->argv[0] = NULL;
110     tok->wspace  = (char *) tok_malloc(WINCR);
111     tok->wmax    = tok->wspace + WINCR;
112     tok->wstart  = tok->wspace;
113     tok->wptr    = tok->wspace;
114     tok->flags   = 0;
115     tok->quote   = Q_none;
116 
117     return tok;
118 }
119 
120 
121 /* tok_reset():
122  *	Reset the tokenizer
123  */
124 public void
125 tok_reset(tok)
126     Tokenizer *tok;
127 {
128     tok->argc  = 0;
129     tok->wstart = tok->wspace;
130     tok->wptr = tok->wspace;
131     tok->flags = 0;
132     tok->quote = Q_none;
133 }
134 
135 
136 /* tok_end():
137  *	Clean up
138  */
139 public void
140 tok_end(tok)
141     Tokenizer *tok;
142 {
143     tok_free((ptr_t) tok->ifs);
144     tok_free((ptr_t) tok->wspace);
145     tok_free((ptr_t) tok->argv);
146     tok_free((ptr_t) tok);
147 }
148 
149 
150 
151 /* tok_line():
152  *	Bourne shell like tokenizing
153  *	Return:
154  *		-1: Internal error
155  *		 3: Quoted return
156  *		 2: Unmatched double quote
157  *		 1: Unmatched single quote
158  *		 0: Ok
159  */
160 public int
161 tok_line(tok, line, argc, argv)
162     Tokenizer *tok;
163     const char* line;
164     int *argc;
165     char ***argv;
166 {
167     const char *ptr;
168 
169     while (1) {
170 	switch (*(ptr = line++)) {
171 	case '\'':
172 	    tok->flags |= TOK_KEEP;
173 	    tok->flags &= ~TOK_EAT;
174 	    switch (tok->quote) {
175 	    case Q_none:
176 		tok->quote = Q_single;	/* Enter single quote mode */
177 		break;
178 
179 	    case Q_single:		/* Exit single quote mode */
180 		tok->quote = Q_none;
181 		break;
182 
183 	    case Q_one:			/* Quote this ' */
184 		tok->quote = Q_none;
185 		*tok->wptr++ = *ptr;
186 		break;
187 
188 	    case Q_double:		/* Stay in double quote mode */
189 		*tok->wptr++ = *ptr;
190 		break;
191 
192 	    case Q_doubleone:		/* Quote this ' */
193 		tok->quote = Q_double;
194 		*tok->wptr++ = *ptr;
195 		break;
196 
197 	    default:
198 		return(-1);
199 	    }
200 	    break;
201 
202 	case '"':
203 	    tok->flags &= ~TOK_EAT;
204 	    tok->flags |= TOK_KEEP;
205 	    switch (tok->quote) {
206 	    case Q_none:		/* Enter double quote mode */
207 		tok->quote = Q_double;
208 		break;
209 
210 	    case Q_double:
211 		tok->quote = Q_none;	/* Exit double quote mode */
212 		break;
213 
214 	    case Q_one:			/* Quote this " */
215 		tok->quote = Q_none;
216 		*tok->wptr++ = *ptr;
217 		break;
218 
219 	    case Q_single:		/* Stay in single quote mode */
220 		*tok->wptr++ = *ptr;
221 		break;
222 
223 	    case Q_doubleone:		/* Quote this " */
224 		tok->quote = Q_double;
225 		*tok->wptr++ = *ptr;
226 		break;
227 
228 	    default:
229 		return(-1);
230 	    }
231 	    break;
232 
233 	case '\\':
234 	    tok->flags |= TOK_KEEP;
235 	    tok->flags &= ~TOK_EAT;
236 	    switch (tok->quote) {
237 	    case Q_none:		/* Quote next character */
238 		tok->quote = Q_one;
239 		break;
240 
241 	    case Q_double:
242 		tok->quote = Q_doubleone;/* Quote next character */
243 		break;
244 
245 	    case Q_one:
246 		*tok->wptr++ = *ptr;
247 		tok->quote = Q_none;	/* Quote this, restore state */
248 		break;
249 
250 	    case Q_single:		/* Stay in single quote mode */
251 		*tok->wptr++ = *ptr;
252 		break;
253 
254 	    case Q_doubleone:		/* Quote this \ */
255 		tok->quote = Q_double;
256 		*tok->wptr++ = *ptr;
257 		break;
258 
259 	    default:
260 		return(-1);
261 	    }
262 	    break;
263 
264 	case '\n':
265 	    tok->flags &= ~TOK_EAT;
266 	    switch (tok->quote) {
267 	    case Q_none:
268 		tok_finish(tok);
269 		*argv = tok->argv;
270 		*argc = tok->argc;
271 		return(0);
272 
273 	    case Q_single:
274 	    case Q_double:
275 		*tok->wptr++ = *ptr;	/* Add the return		*/
276 		break;
277 
278 	    case Q_doubleone:
279 		tok->flags |= TOK_EAT;
280 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
281 		break;
282 
283 	    case Q_one:
284 		tok->flags |= TOK_EAT;
285 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
286 		break;
287 
288 	    default:
289 		return(0);
290 	    }
291 	    break;
292 
293 	case '\0':
294 	    switch (tok->quote) {
295 	    case Q_none:
296 		/* Finish word and return */
297 		if (tok->flags & TOK_EAT) {
298 		    tok->flags &= ~TOK_EAT;
299 		    return 3;
300 		}
301 		tok_finish(tok);
302 		*argv = tok->argv;
303 		*argc = tok->argc;
304 		return(0);
305 
306 	    case Q_single:
307 		return(1);
308 
309 	    case Q_double:
310 		return(2);
311 
312 	    case Q_doubleone:
313 		tok->quote = Q_double;
314 		*tok->wptr++ = *ptr;
315 		break;
316 
317 	    case Q_one:
318 		tok->quote = Q_none;
319 		*tok->wptr++ = *ptr;
320 		break;
321 
322 	    default:
323 		return(-1);
324 	    }
325 	    break;
326 
327 	default:
328 	    tok->flags &= ~TOK_EAT;
329 	    switch (tok->quote) {
330 	    case Q_none:
331 		if (strchr(tok->ifs, *ptr) != NULL)
332 		    tok_finish(tok);
333 		else
334 		    *tok->wptr++ = *ptr;
335 		break;
336 
337 	    case Q_single:
338 	    case Q_double:
339 		*tok->wptr++ = *ptr;
340 		break;
341 
342 
343 	    case Q_doubleone:
344 		*tok->wptr++ = '\\';
345 		tok->quote = Q_double;
346 		*tok->wptr++ = *ptr;
347 		break;
348 
349 	    case Q_one:
350 		tok->quote = Q_none;
351 		*tok->wptr++ = *ptr;
352 		break;
353 
354 	    default:
355 		return(-1);
356 
357 	    }
358 	    break;
359 	}
360 
361 	if (tok->wptr >= tok->wmax - 4) {
362 	    size_t size = tok->wmax - tok->wspace + WINCR;
363 	    char *s = (char *) tok_realloc(tok->wspace, size);
364 	    /*SUPPRESS 22*/
365 	    int offs = s - tok->wspace;
366 
367 	    if (offs != 0) {
368 		int i;
369 		for (i = 0; i < tok->argc; i++)
370 		    tok->argv[i] = tok->argv[i] + offs;
371 		tok->wptr   = tok->wptr + offs;
372 		tok->wstart = tok->wstart + offs;
373 		tok->wmax   = s + size;
374 		tok->wspace = s;
375 	    }
376 	}
377 
378 	if (tok->argc >= tok->amax - 4) {
379 	    tok->amax += AINCR;
380 	    tok->argv = (char **) tok_realloc(tok->argv,
381 					      tok->amax * sizeof(char*));
382 	}
383 
384     }
385 }
386