xref: /openbsd-src/lib/libedit/tokenizer.c (revision e5157e49389faebcb42b7237d55fbf096d9c2523)
1 /*	$OpenBSD: tokenizer.c,v 1.13 2014/10/17 06:07:50 deraadt Exp $	*/
2 /*	$NetBSD: tokenizer.c,v 1.18 2010/01/03 18:27:10 christos Exp $	*/
3 
4 /*-
5  * Copyright (c) 1992, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Christos Zoulas of Cornell University.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include "config.h"
37 
38 /* We build this file twice, once as NARROW, once as WIDE. */
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
42 #include <string.h>
43 #include <stdlib.h>
44 #include "histedit.h"
45 #include "chartype.h"
46 
/*
 * Quoting state carried by the tokenizer from one character to the next
 * (and from one input line to the next).
 */
typedef enum {
	Q_none = 0,	/* not inside any quoting construct */
	Q_single = 1,	/* between single quotes */
	Q_double = 2,	/* between double quotes */
	Q_one = 3,	/* backslash seen while unquoted */
	Q_doubleone = 4	/* backslash seen between double quotes */
} quote_t;
50 
/* Bits stored in the tokenizer's flags field. */
#define	TOK_KEEP	0x1	/* emit the current word even if it is empty */
#define	TOK_EAT		0x2	/* a backslash-newline pair was just swallowed */

/* Initial size and growth step, in elements, of the two buffers. */
#define	WINCR		20	/* word buffer (Char) */
#define	AINCR		10	/* argument vector (Char *) */

/* Separators used when the caller passes no ifs string to init. */
#define	IFS		STR("\t \n")

/* Duplicate a Char string using the width-appropriate helper. */
#define	tok_strdup(s)		Strdup(s)
60 
61 
62 struct TYPE(tokenizer) {
63 	Char	*ifs;		/* In field separator			 */
64 	int	 argc, amax;	/* Current and maximum number of args	 */
65 	Char   **argv;		/* Argument list			 */
66 	Char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
67 	Char	*wstart;	/* Beginning of next word		 */
68 	Char	*wspace;	/* Space of word buffer			 */
69 	quote_t	 quote;		/* Quoting state			 */
70 	int	 flags;		/* flags;				 */
71 };
72 
73 
74 private void FUN(tok,finish)(TYPE(Tokenizer) *);
75 
76 
77 /* FUN(tok,finish)():
78  *	Finish a word in the tokenizer.
79  */
80 private void
81 FUN(tok,finish)(TYPE(Tokenizer) *tok)
82 {
83 
84 	*tok->wptr = '\0';
85 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
86 		tok->argv[tok->argc++] = tok->wstart;
87 		tok->argv[tok->argc] = NULL;
88 		tok->wstart = ++tok->wptr;
89 	}
90 	tok->flags &= ~TOK_KEEP;
91 }
92 
93 
94 /* FUN(tok,init)():
95  *	Initialize the tokenizer
96  */
97 public TYPE(Tokenizer) *
98 FUN(tok,init)(const Char *ifs)
99 {
100 	TYPE(Tokenizer) *tok = malloc(sizeof(TYPE(Tokenizer)));
101 
102 	if (tok == NULL)
103 		return NULL;
104 	tok->ifs = tok_strdup(ifs ? ifs : IFS);
105 	if (tok->ifs == NULL) {
106 		free((ptr_t)tok);
107 		return NULL;
108 	}
109 	tok->argc = 0;
110 	tok->amax = AINCR;
111 	tok->argv = reallocarray(NULL, tok->amax, sizeof(*tok->argv));
112 	if (tok->argv == NULL) {
113 		free((ptr_t)tok->ifs);
114 		free((ptr_t)tok);
115 		return NULL;
116 	}
117 	tok->argv[0] = NULL;
118 	tok->wspace = reallocarray(NULL, WINCR, sizeof(*tok->wspace));
119 	if (tok->wspace == NULL) {
120 		free((ptr_t)tok->argv);
121 		free((ptr_t)tok->ifs);
122 		free((ptr_t)tok);
123 		return NULL;
124 	}
125 	tok->wmax = tok->wspace + WINCR;
126 	tok->wstart = tok->wspace;
127 	tok->wptr = tok->wspace;
128 	tok->flags = 0;
129 	tok->quote = Q_none;
130 
131 	return (tok);
132 }
133 
134 
135 /* FUN(tok,reset)():
136  *	Reset the tokenizer
137  */
138 public void
139 FUN(tok,reset)(TYPE(Tokenizer) *tok)
140 {
141 
142 	tok->argc = 0;
143 	tok->wstart = tok->wspace;
144 	tok->wptr = tok->wspace;
145 	tok->flags = 0;
146 	tok->quote = Q_none;
147 }
148 
149 
150 /* FUN(tok,end)():
151  *	Clean up
152  */
153 public void
154 FUN(tok,end)(TYPE(Tokenizer) *tok)
155 {
156 
157 	free((ptr_t) tok->ifs);
158 	free((ptr_t) tok->wspace);
159 	free((ptr_t) tok->argv);
160 	free((ptr_t) tok);
161 }
162 
163 
164 
165 /* FUN(tok,line)():
166  *	Bourne shell (sh(1)) like tokenizing
167  *	Arguments:
168  *		tok	current tokenizer state (setup with FUN(tok,init)())
169  *		line	line to parse
170  *	Returns:
171  *		-1	Internal error
172  *		 3	Quoted return
173  *		 2	Unmatched double quote
174  *		 1	Unmatched single quote
175  *		 0	Ok
176  *	Modifies (if return value is 0):
177  *		argc	number of arguments
178  *		argv	argument array
179  *		cursorc	if !NULL, argv element containing cursor
180  *		cursorv	if !NULL, offset in argv[cursorc] of cursor
181  */
182 public int
183 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
184     int *argc, const Char ***argv, int *cursorc, int *cursoro)
185 {
186 	const Char *ptr;
187 	int cc, co;
188 
189 	cc = co = -1;
190 	ptr = line->buffer;
191 	for (ptr = line->buffer; ;ptr++) {
192 		if (ptr >= line->lastchar)
193 			ptr = STR("");
194 		if (ptr == line->cursor) {
195 			cc = tok->argc;
196 			co = (int)(tok->wptr - tok->wstart);
197 		}
198 		switch (*ptr) {
199 		case '\'':
200 			tok->flags |= TOK_KEEP;
201 			tok->flags &= ~TOK_EAT;
202 			switch (tok->quote) {
203 			case Q_none:
204 				tok->quote = Q_single;	/* Enter single quote
205 							 * mode */
206 				break;
207 
208 			case Q_single:	/* Exit single quote mode */
209 				tok->quote = Q_none;
210 				break;
211 
212 			case Q_one:	/* Quote this ' */
213 				tok->quote = Q_none;
214 				*tok->wptr++ = *ptr;
215 				break;
216 
217 			case Q_double:	/* Stay in double quote mode */
218 				*tok->wptr++ = *ptr;
219 				break;
220 
221 			case Q_doubleone:	/* Quote this ' */
222 				tok->quote = Q_double;
223 				*tok->wptr++ = *ptr;
224 				break;
225 
226 			default:
227 				return (-1);
228 			}
229 			break;
230 
231 		case '"':
232 			tok->flags &= ~TOK_EAT;
233 			tok->flags |= TOK_KEEP;
234 			switch (tok->quote) {
235 			case Q_none:	/* Enter double quote mode */
236 				tok->quote = Q_double;
237 				break;
238 
239 			case Q_double:	/* Exit double quote mode */
240 				tok->quote = Q_none;
241 				break;
242 
243 			case Q_one:	/* Quote this " */
244 				tok->quote = Q_none;
245 				*tok->wptr++ = *ptr;
246 				break;
247 
248 			case Q_single:	/* Stay in single quote mode */
249 				*tok->wptr++ = *ptr;
250 				break;
251 
252 			case Q_doubleone:	/* Quote this " */
253 				tok->quote = Q_double;
254 				*tok->wptr++ = *ptr;
255 				break;
256 
257 			default:
258 				return (-1);
259 			}
260 			break;
261 
262 		case '\\':
263 			tok->flags |= TOK_KEEP;
264 			tok->flags &= ~TOK_EAT;
265 			switch (tok->quote) {
266 			case Q_none:	/* Quote next character */
267 				tok->quote = Q_one;
268 				break;
269 
270 			case Q_double:	/* Quote next character */
271 				tok->quote = Q_doubleone;
272 				break;
273 
274 			case Q_one:	/* Quote this, restore state */
275 				*tok->wptr++ = *ptr;
276 				tok->quote = Q_none;
277 				break;
278 
279 			case Q_single:	/* Stay in single quote mode */
280 				*tok->wptr++ = *ptr;
281 				break;
282 
283 			case Q_doubleone:	/* Quote this \ */
284 				tok->quote = Q_double;
285 				*tok->wptr++ = *ptr;
286 				break;
287 
288 			default:
289 				return (-1);
290 			}
291 			break;
292 
293 		case '\n':
294 			tok->flags &= ~TOK_EAT;
295 			switch (tok->quote) {
296 			case Q_none:
297 				goto tok_line_outok;
298 
299 			case Q_single:
300 			case Q_double:
301 				*tok->wptr++ = *ptr;	/* Add the return */
302 				break;
303 
304 			case Q_doubleone:   /* Back to double, eat the '\n' */
305 				tok->flags |= TOK_EAT;
306 				tok->quote = Q_double;
307 				break;
308 
309 			case Q_one:	/* No quote, more eat the '\n' */
310 				tok->flags |= TOK_EAT;
311 				tok->quote = Q_none;
312 				break;
313 
314 			default:
315 				return (0);
316 			}
317 			break;
318 
319 		case '\0':
320 			switch (tok->quote) {
321 			case Q_none:
322 				/* Finish word and return */
323 				if (tok->flags & TOK_EAT) {
324 					tok->flags &= ~TOK_EAT;
325 					return (3);
326 				}
327 				goto tok_line_outok;
328 
329 			case Q_single:
330 				return (1);
331 
332 			case Q_double:
333 				return (2);
334 
335 			case Q_doubleone:
336 				tok->quote = Q_double;
337 				*tok->wptr++ = *ptr;
338 				break;
339 
340 			case Q_one:
341 				tok->quote = Q_none;
342 				*tok->wptr++ = *ptr;
343 				break;
344 
345 			default:
346 				return (-1);
347 			}
348 			break;
349 
350 		default:
351 			tok->flags &= ~TOK_EAT;
352 			switch (tok->quote) {
353 			case Q_none:
354 				if (Strchr(tok->ifs, *ptr) != NULL)
355 					FUN(tok,finish)(tok);
356 				else
357 					*tok->wptr++ = *ptr;
358 				break;
359 
360 			case Q_single:
361 			case Q_double:
362 				*tok->wptr++ = *ptr;
363 				break;
364 
365 
366 			case Q_doubleone:
367 				*tok->wptr++ = '\\';
368 				tok->quote = Q_double;
369 				*tok->wptr++ = *ptr;
370 				break;
371 
372 			case Q_one:
373 				tok->quote = Q_none;
374 				*tok->wptr++ = *ptr;
375 				break;
376 
377 			default:
378 				return (-1);
379 
380 			}
381 			break;
382 		}
383 
384 		if (tok->wptr >= tok->wmax - 4) {
385 			size_t size = tok->wmax - tok->wspace + WINCR;
386 			Char *s = reallocarray(tok->wspace, size, sizeof(*s));
387 			if (s == NULL)
388 				return (-1);
389 
390 			if (s != tok->wspace) {
391 				int i;
392 				for (i = 0; i < tok->argc; i++) {
393 				    tok->argv[i] =
394 					(tok->argv[i] - tok->wspace) + s;
395 				}
396 				tok->wptr = (tok->wptr - tok->wspace) + s;
397 				tok->wstart = (tok->wstart - tok->wspace) + s;
398 				tok->wspace = s;
399 			}
400 			tok->wmax = s + size;
401 		}
402 		if (tok->argc >= tok->amax - 4) {
403 			Char **p;
404 			tok->amax += AINCR;
405 			p = reallocarray(tok->argv, tok->amax, sizeof(*p));
406 			if (p == NULL)
407 				return (-1);
408 			tok->argv = p;
409 		}
410 	}
411  tok_line_outok:
412 	if (cc == -1 && co == -1) {
413 		cc = tok->argc;
414 		co = (int)(tok->wptr - tok->wstart);
415 	}
416 	if (cursorc != NULL)
417 		*cursorc = cc;
418 	if (cursoro != NULL)
419 		*cursoro = co;
420 	FUN(tok,finish)(tok);
421 	*argv = (const Char **)tok->argv;
422 	*argc = tok->argc;
423 	return (0);
424 }
425 
426 /* FUN(tok,str)():
427  *	Simpler version of tok_line, taking a NUL terminated line
428  *	and splitting into words, ignoring cursor state.
429  */
430 public int
431 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
432     const Char ***argv)
433 {
434 	TYPE(LineInfo) li;
435 
436 	memset(&li, 0, sizeof(li));
437 	li.buffer = line;
438 	li.cursor = li.lastchar = Strchr(line, '\0');
439 	return (FUN(tok,line)(tok, &li, argc, argv, NULL, NULL));
440 }
441