xref: /netbsd-src/lib/libedit/tokenizer.c (revision 73704c4ce4ee2a60eb617e693ce7e9f03902613e)
1 /*	$NetBSD: tokenizer.c,v 1.13 2003/10/18 23:48:42 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Christos Zoulas of Cornell University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "config.h"
36 #if !defined(lint) && !defined(SCCSID)
37 #if 0
38 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
39 #else
40 __RCSID("$NetBSD: tokenizer.c,v 1.13 2003/10/18 23:48:42 christos Exp $");
41 #endif
42 #endif /* not lint && not SCCSID */
43 
44 /*
45  * tokenizer.c: Bourne shell-like tokenizer
46  */
47 #include <string.h>
48 #include <stdlib.h>
49 #include "tokenizer.h"
50 
/*
 * Quoting state of the tokenizer while it scans a line.
 */
typedef enum {
	Q_none,		/* not inside any quoting construct */
	Q_single,	/* between single quotes */
	Q_double,	/* between double quotes */
	Q_one,		/* backslash seen outside quotes: next char is quoted */
	Q_doubleone	/* backslash seen inside double quotes */
} quote_t;
54 
/* Default set of field separator characters, used when tok_init() gets NULL */
#define	IFS		"\t \n"

/* Bits kept in the flags member of the tokenizer */
#define	TOK_KEEP	1	/* emit the current word even if it is empty */
#define	TOK_EAT		2	/* a backslash-newline pair was just swallowed */

/* Initial size and growth step of the word buffer (bytes) ... */
#define	WINCR		20
/* ... and of the argument vector (entries) */
#define	AINCR		10

/* Allocation hooks; here they simply forward to the C library */
#define	tok_strdup(a)		strdup(a)
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
67 
68 
69 struct tokenizer {
70 	char	*ifs;		/* In field separator			 */
71 	int	 argc, amax;	/* Current and maximum number of args	 */
72 	char   **argv;		/* Argument list			 */
73 	char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
74 	char	*wstart;	/* Beginning of next word		 */
75 	char	*wspace;	/* Space of word buffer			 */
76 	quote_t	 quote;		/* Quoting state			 */
77 	int	 flags;		/* flags;				 */
78 };
79 
80 
81 private void tok_finish(Tokenizer *);
82 
83 
84 /* tok_finish():
85  *	Finish a word in the tokenizer.
86  */
87 private void
88 tok_finish(Tokenizer *tok)
89 {
90 
91 	*tok->wptr = '\0';
92 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
93 		tok->argv[tok->argc++] = tok->wstart;
94 		tok->argv[tok->argc] = NULL;
95 		tok->wstart = ++tok->wptr;
96 	}
97 	tok->flags &= ~TOK_KEEP;
98 }
99 
100 
101 /* tok_init():
102  *	Initialize the tokenizer
103  */
104 public Tokenizer *
105 tok_init(const char *ifs)
106 {
107 	Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
108 
109 	if (tok == NULL)
110 		return NULL;
111 	tok->ifs = tok_strdup(ifs ? ifs : IFS);
112 	if (tok->ifs == NULL) {
113 		tok_free((ptr_t)tok);
114 		return NULL;
115 	}
116 	tok->argc = 0;
117 	tok->amax = AINCR;
118 	tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
119 	if (tok->argv == NULL) {
120 		tok_free((ptr_t)tok->ifs);
121 		tok_free((ptr_t)tok);
122 		return NULL;
123 	}
124 	tok->argv[0] = NULL;
125 	tok->wspace = (char *) tok_malloc(WINCR);
126 	if (tok->wspace == NULL) {
127 		tok_free((ptr_t)tok->argv);
128 		tok_free((ptr_t)tok->ifs);
129 		tok_free((ptr_t)tok);
130 		return NULL;
131 	}
132 	tok->wmax = tok->wspace + WINCR;
133 	tok->wstart = tok->wspace;
134 	tok->wptr = tok->wspace;
135 	tok->flags = 0;
136 	tok->quote = Q_none;
137 
138 	return (tok);
139 }
140 
141 
142 /* tok_reset():
143  *	Reset the tokenizer
144  */
145 public void
146 tok_reset(Tokenizer *tok)
147 {
148 
149 	tok->argc = 0;
150 	tok->wstart = tok->wspace;
151 	tok->wptr = tok->wspace;
152 	tok->flags = 0;
153 	tok->quote = Q_none;
154 }
155 
156 
157 /* tok_end():
158  *	Clean up
159  */
160 public void
161 tok_end(Tokenizer *tok)
162 {
163 
164 	tok_free((ptr_t) tok->ifs);
165 	tok_free((ptr_t) tok->wspace);
166 	tok_free((ptr_t) tok->argv);
167 	tok_free((ptr_t) tok);
168 }
169 
170 
171 
172 /* tok_line():
173  *	Bourne shell like tokenizing
174  *	Return:
175  *		-1: Internal error
176  *		 3: Quoted return
177  *		 2: Unmatched double quote
178  *		 1: Unmatched single quote
179  *		 0: Ok
180  */
181 public int
182 tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv)
183 {
184 	const char *ptr;
185 
186 	for (;;) {
187 		switch (*(ptr = line++)) {
188 		case '\'':
189 			tok->flags |= TOK_KEEP;
190 			tok->flags &= ~TOK_EAT;
191 			switch (tok->quote) {
192 			case Q_none:
193 				tok->quote = Q_single;	/* Enter single quote
194 							 * mode */
195 				break;
196 
197 			case Q_single:	/* Exit single quote mode */
198 				tok->quote = Q_none;
199 				break;
200 
201 			case Q_one:	/* Quote this ' */
202 				tok->quote = Q_none;
203 				*tok->wptr++ = *ptr;
204 				break;
205 
206 			case Q_double:	/* Stay in double quote mode */
207 				*tok->wptr++ = *ptr;
208 				break;
209 
210 			case Q_doubleone:	/* Quote this ' */
211 				tok->quote = Q_double;
212 				*tok->wptr++ = *ptr;
213 				break;
214 
215 			default:
216 				return (-1);
217 			}
218 			break;
219 
220 		case '"':
221 			tok->flags &= ~TOK_EAT;
222 			tok->flags |= TOK_KEEP;
223 			switch (tok->quote) {
224 			case Q_none:	/* Enter double quote mode */
225 				tok->quote = Q_double;
226 				break;
227 
228 			case Q_double:	/* Exit double quote mode */
229 				tok->quote = Q_none;
230 				break;
231 
232 			case Q_one:	/* Quote this " */
233 				tok->quote = Q_none;
234 				*tok->wptr++ = *ptr;
235 				break;
236 
237 			case Q_single:	/* Stay in single quote mode */
238 				*tok->wptr++ = *ptr;
239 				break;
240 
241 			case Q_doubleone:	/* Quote this " */
242 				tok->quote = Q_double;
243 				*tok->wptr++ = *ptr;
244 				break;
245 
246 			default:
247 				return (-1);
248 			}
249 			break;
250 
251 		case '\\':
252 			tok->flags |= TOK_KEEP;
253 			tok->flags &= ~TOK_EAT;
254 			switch (tok->quote) {
255 			case Q_none:	/* Quote next character */
256 				tok->quote = Q_one;
257 				break;
258 
259 			case Q_double:	/* Quote next character */
260 				tok->quote = Q_doubleone;
261 				break;
262 
263 			case Q_one:	/* Quote this, restore state */
264 				*tok->wptr++ = *ptr;
265 				tok->quote = Q_none;
266 				break;
267 
268 			case Q_single:	/* Stay in single quote mode */
269 				*tok->wptr++ = *ptr;
270 				break;
271 
272 			case Q_doubleone:	/* Quote this \ */
273 				tok->quote = Q_double;
274 				*tok->wptr++ = *ptr;
275 				break;
276 
277 			default:
278 				return (-1);
279 			}
280 			break;
281 
282 		case '\n':
283 			tok->flags &= ~TOK_EAT;
284 			switch (tok->quote) {
285 			case Q_none:
286 				tok_finish(tok);
287 				*argv = (const char **)tok->argv;
288 				*argc = tok->argc;
289 				return (0);
290 
291 			case Q_single:
292 			case Q_double:
293 				*tok->wptr++ = *ptr;	/* Add the return */
294 				break;
295 
296 			case Q_doubleone:   /* Back to double, eat the '\n' */
297 				tok->flags |= TOK_EAT;
298 				tok->quote = Q_double;
299 				break;
300 
301 			case Q_one:	/* No quote, more eat the '\n' */
302 				tok->flags |= TOK_EAT;
303 				tok->quote = Q_none;
304 				break;
305 
306 			default:
307 				return (0);
308 			}
309 			break;
310 
311 		case '\0':
312 			switch (tok->quote) {
313 			case Q_none:
314 				/* Finish word and return */
315 				if (tok->flags & TOK_EAT) {
316 					tok->flags &= ~TOK_EAT;
317 					return (3);
318 				}
319 				tok_finish(tok);
320 				*argv = (const char **)tok->argv;
321 				*argc = tok->argc;
322 				return (0);
323 
324 			case Q_single:
325 				return (1);
326 
327 			case Q_double:
328 				return (2);
329 
330 			case Q_doubleone:
331 				tok->quote = Q_double;
332 				*tok->wptr++ = *ptr;
333 				break;
334 
335 			case Q_one:
336 				tok->quote = Q_none;
337 				*tok->wptr++ = *ptr;
338 				break;
339 
340 			default:
341 				return (-1);
342 			}
343 			break;
344 
345 		default:
346 			tok->flags &= ~TOK_EAT;
347 			switch (tok->quote) {
348 			case Q_none:
349 				if (strchr(tok->ifs, *ptr) != NULL)
350 					tok_finish(tok);
351 				else
352 					*tok->wptr++ = *ptr;
353 				break;
354 
355 			case Q_single:
356 			case Q_double:
357 				*tok->wptr++ = *ptr;
358 				break;
359 
360 
361 			case Q_doubleone:
362 				*tok->wptr++ = '\\';
363 				tok->quote = Q_double;
364 				*tok->wptr++ = *ptr;
365 				break;
366 
367 			case Q_one:
368 				tok->quote = Q_none;
369 				*tok->wptr++ = *ptr;
370 				break;
371 
372 			default:
373 				return (-1);
374 
375 			}
376 			break;
377 		}
378 
379 		if (tok->wptr >= tok->wmax - 4) {
380 			size_t size = tok->wmax - tok->wspace + WINCR;
381 			char *s = (char *) tok_realloc(tok->wspace, size);
382 			if (s == NULL)
383 				return (-1);
384 
385 			if (s != tok->wspace) {
386 				int i;
387 				for (i = 0; i < tok->argc; i++) {
388 				    tok->argv[i] =
389 					(tok->argv[i] - tok->wspace) + s;
390 				}
391 				tok->wptr = (tok->wptr - tok->wspace) + s;
392 				tok->wstart = (tok->wstart - tok->wspace) + s;
393 				tok->wspace = s;
394 			}
395 			tok->wmax = s + size;
396 		}
397 		if (tok->argc >= tok->amax - 4) {
398 			char **p;
399 			tok->amax += AINCR;
400 			p = (char **) tok_realloc(tok->argv,
401 			    tok->amax * sizeof(char *));
402 			if (p == NULL)
403 				return (-1);
404 			tok->argv = p;
405 		}
406 	}
407 }
408