xref: /openbsd-src/lib/libedit/tokenizer.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: tokenizer.c,v 1.10 2003/11/25 20:12:38 otto Exp $	*/
2 /*	$NetBSD: tokenizer.c,v 1.13 2003/10/18 23:48:42 christos Exp $	*/
3 
4 /*-
5  * Copyright (c) 1992, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Christos Zoulas of Cornell University.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include "config.h"
37 #if !defined(lint) && !defined(SCCSID)
38 #if 0
39 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
40 #else
41 static const char rcsid[] = "$OpenBSD: tokenizer.c,v 1.10 2003/11/25 20:12:38 otto Exp $";
42 #endif
43 #endif /* not lint && not SCCSID */
44 
45 /*
46  * tokenize.c: Bourne shell like tokenizer
47  */
48 #include <string.h>
49 #include <stdlib.h>
50 #include "tokenizer.h"
51 
/*
 * Quoting state of the scanner in tok_line().
 */
typedef enum {
	Q_none,		/* Not inside any quoting			 */
	Q_single,	/* Inside '...'					 */
	Q_double,	/* Inside "..."					 */
	Q_one,		/* Backslash seen outside quotes		 */
	Q_doubleone	/* Backslash seen inside "..."			 */
} quote_t;
55 
/* Separator characters used when tok_init() is given a NULL ifs. */
#define	IFS		"\t \n"

/*
 * Bits stored in tokenizer.flags.
 */
#define	TOK_KEEP	0x01	/* Emit the current word even if empty	 */
#define	TOK_EAT		0x02	/* An escaped newline was just consumed	 */

/*
 * Allocation steps: bytes for the word buffer, slots for argv.
 */
#define	WINCR		20
#define	AINCR		10

/*
 * Allocator hooks; all of them map straight onto the C library.
 */
#define	tok_malloc(a)		malloc(a)
#define	tok_realloc(a, b)	realloc(a, b)
#define	tok_free(a)		free(a)
#define	tok_strdup(a)		strdup(a)
68 
69 
70 struct tokenizer {
71 	char	*ifs;		/* In field separator			 */
72 	int	 argc, amax;	/* Current and maximum number of args	 */
73 	char   **argv;		/* Argument list			 */
74 	char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
75 	char	*wstart;	/* Beginning of next word		 */
76 	char	*wspace;	/* Space of word buffer			 */
77 	quote_t	 quote;		/* Quoting state			 */
78 	int	 flags;		/* flags;				 */
79 };
80 
81 
82 private void tok_finish(Tokenizer *);
83 
84 
85 /* tok_finish():
86  *	Finish a word in the tokenizer.
87  */
88 private void
89 tok_finish(Tokenizer *tok)
90 {
91 
92 	*tok->wptr = '\0';
93 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
94 		tok->argv[tok->argc++] = tok->wstart;
95 		tok->argv[tok->argc] = NULL;
96 		tok->wstart = ++tok->wptr;
97 	}
98 	tok->flags &= ~TOK_KEEP;
99 }
100 
101 
102 /* tok_init():
103  *	Initialize the tokenizer
104  */
105 public Tokenizer *
106 tok_init(const char *ifs)
107 {
108 	Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
109 
110 	if (tok == NULL)
111 		return NULL;
112 	tok->ifs = tok_strdup(ifs ? ifs : IFS);
113 	if (tok->ifs == NULL) {
114 		tok_free((ptr_t)tok);
115 		return NULL;
116 	}
117 	tok->argc = 0;
118 	tok->amax = AINCR;
119 	tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
120 	if (tok->argv == NULL) {
121 		tok_free((ptr_t)tok->ifs);
122 		tok_free((ptr_t)tok);
123 		return NULL;
124 	}
125 	tok->argv[0] = NULL;
126 	tok->wspace = (char *) tok_malloc(WINCR);
127 	if (tok->wspace == NULL) {
128 		tok_free((ptr_t)tok->argv);
129 		tok_free((ptr_t)tok->ifs);
130 		tok_free((ptr_t)tok);
131 		return NULL;
132 	}
133 	tok->wmax = tok->wspace + WINCR;
134 	tok->wstart = tok->wspace;
135 	tok->wptr = tok->wspace;
136 	tok->flags = 0;
137 	tok->quote = Q_none;
138 
139 	return (tok);
140 }
141 
142 
143 /* tok_reset():
144  *	Reset the tokenizer
145  */
146 public void
147 tok_reset(Tokenizer *tok)
148 {
149 
150 	tok->argc = 0;
151 	tok->wstart = tok->wspace;
152 	tok->wptr = tok->wspace;
153 	tok->flags = 0;
154 	tok->quote = Q_none;
155 }
156 
157 
158 /* tok_end():
159  *	Clean up
160  */
161 public void
162 tok_end(Tokenizer *tok)
163 {
164 
165 	tok_free((ptr_t) tok->ifs);
166 	tok_free((ptr_t) tok->wspace);
167 	tok_free((ptr_t) tok->argv);
168 	tok_free((ptr_t) tok);
169 }
170 
171 
172 
173 /* tok_line():
174  *	Bourne shell like tokenizing
175  *	Return:
176  *		-1: Internal error
177  *		 3: Quoted return
178  *		 2: Unmatched double quote
179  *		 1: Unmatched single quote
180  *		 0: Ok
181  */
182 public int
183 tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv)
184 {
185 	const char *ptr;
186 
187 	for (;;) {
188 		switch (*(ptr = line++)) {
189 		case '\'':
190 			tok->flags |= TOK_KEEP;
191 			tok->flags &= ~TOK_EAT;
192 			switch (tok->quote) {
193 			case Q_none:
194 				tok->quote = Q_single;	/* Enter single quote
195 							 * mode */
196 				break;
197 
198 			case Q_single:	/* Exit single quote mode */
199 				tok->quote = Q_none;
200 				break;
201 
202 			case Q_one:	/* Quote this ' */
203 				tok->quote = Q_none;
204 				*tok->wptr++ = *ptr;
205 				break;
206 
207 			case Q_double:	/* Stay in double quote mode */
208 				*tok->wptr++ = *ptr;
209 				break;
210 
211 			case Q_doubleone:	/* Quote this ' */
212 				tok->quote = Q_double;
213 				*tok->wptr++ = *ptr;
214 				break;
215 
216 			default:
217 				return (-1);
218 			}
219 			break;
220 
221 		case '"':
222 			tok->flags &= ~TOK_EAT;
223 			tok->flags |= TOK_KEEP;
224 			switch (tok->quote) {
225 			case Q_none:	/* Enter double quote mode */
226 				tok->quote = Q_double;
227 				break;
228 
229 			case Q_double:	/* Exit double quote mode */
230 				tok->quote = Q_none;
231 				break;
232 
233 			case Q_one:	/* Quote this " */
234 				tok->quote = Q_none;
235 				*tok->wptr++ = *ptr;
236 				break;
237 
238 			case Q_single:	/* Stay in single quote mode */
239 				*tok->wptr++ = *ptr;
240 				break;
241 
242 			case Q_doubleone:	/* Quote this " */
243 				tok->quote = Q_double;
244 				*tok->wptr++ = *ptr;
245 				break;
246 
247 			default:
248 				return (-1);
249 			}
250 			break;
251 
252 		case '\\':
253 			tok->flags |= TOK_KEEP;
254 			tok->flags &= ~TOK_EAT;
255 			switch (tok->quote) {
256 			case Q_none:	/* Quote next character */
257 				tok->quote = Q_one;
258 				break;
259 
260 			case Q_double:	/* Quote next character */
261 				tok->quote = Q_doubleone;
262 				break;
263 
264 			case Q_one:	/* Quote this, restore state */
265 				*tok->wptr++ = *ptr;
266 				tok->quote = Q_none;
267 				break;
268 
269 			case Q_single:	/* Stay in single quote mode */
270 				*tok->wptr++ = *ptr;
271 				break;
272 
273 			case Q_doubleone:	/* Quote this \ */
274 				tok->quote = Q_double;
275 				*tok->wptr++ = *ptr;
276 				break;
277 
278 			default:
279 				return (-1);
280 			}
281 			break;
282 
283 		case '\n':
284 			tok->flags &= ~TOK_EAT;
285 			switch (tok->quote) {
286 			case Q_none:
287 				tok_finish(tok);
288 				*argv = (const char **)tok->argv;
289 				*argc = tok->argc;
290 				return (0);
291 
292 			case Q_single:
293 			case Q_double:
294 				*tok->wptr++ = *ptr;	/* Add the return */
295 				break;
296 
297 			case Q_doubleone:   /* Back to double, eat the '\n' */
298 				tok->flags |= TOK_EAT;
299 				tok->quote = Q_double;
300 				break;
301 
302 			case Q_one:	/* No quote, more eat the '\n' */
303 				tok->flags |= TOK_EAT;
304 				tok->quote = Q_none;
305 				break;
306 
307 			default:
308 				return (0);
309 			}
310 			break;
311 
312 		case '\0':
313 			switch (tok->quote) {
314 			case Q_none:
315 				/* Finish word and return */
316 				if (tok->flags & TOK_EAT) {
317 					tok->flags &= ~TOK_EAT;
318 					return (3);
319 				}
320 				tok_finish(tok);
321 				*argv = (const char **)tok->argv;
322 				*argc = tok->argc;
323 				return (0);
324 
325 			case Q_single:
326 				return (1);
327 
328 			case Q_double:
329 				return (2);
330 
331 			case Q_doubleone:
332 				tok->quote = Q_double;
333 				*tok->wptr++ = *ptr;
334 				break;
335 
336 			case Q_one:
337 				tok->quote = Q_none;
338 				*tok->wptr++ = *ptr;
339 				break;
340 
341 			default:
342 				return (-1);
343 			}
344 			break;
345 
346 		default:
347 			tok->flags &= ~TOK_EAT;
348 			switch (tok->quote) {
349 			case Q_none:
350 				if (strchr(tok->ifs, *ptr) != NULL)
351 					tok_finish(tok);
352 				else
353 					*tok->wptr++ = *ptr;
354 				break;
355 
356 			case Q_single:
357 			case Q_double:
358 				*tok->wptr++ = *ptr;
359 				break;
360 
361 
362 			case Q_doubleone:
363 				*tok->wptr++ = '\\';
364 				tok->quote = Q_double;
365 				*tok->wptr++ = *ptr;
366 				break;
367 
368 			case Q_one:
369 				tok->quote = Q_none;
370 				*tok->wptr++ = *ptr;
371 				break;
372 
373 			default:
374 				return (-1);
375 
376 			}
377 			break;
378 		}
379 
380 		if (tok->wptr >= tok->wmax - 4) {
381 			size_t size = tok->wmax - tok->wspace + WINCR;
382 			char *s = (char *) tok_realloc(tok->wspace, size);
383 			if (s == NULL)
384 				return (-1);
385 
386 			if (s != tok->wspace) {
387 				int i;
388 				for (i = 0; i < tok->argc; i++) {
389 				    tok->argv[i] =
390 					(tok->argv[i] - tok->wspace) + s;
391 				}
392 				tok->wptr = (tok->wptr - tok->wspace) + s;
393 				tok->wstart = (tok->wstart - tok->wspace) + s;
394 				tok->wspace = s;
395 			}
396 			tok->wmax = s + size;
397 		}
398 		if (tok->argc >= tok->amax - 4) {
399 			char **p;
400 			tok->amax += AINCR;
401 			p = (char **) tok_realloc(tok->argv,
402 			    tok->amax * sizeof(char *));
403 			if (p == NULL)
404 				return (-1);
405 			tok->argv = p;
406 		}
407 	}
408 }
409