xref: /netbsd-src/external/ibm-public/postfix/dist/src/global/tok822_parse.c (revision 41fbaed053f8fbfdf9d2a4ee0a7386a3c83f8505)
1 /*	$NetBSD: tok822_parse.c,v 1.1.1.1 2009/06/23 10:08:48 tron Exp $	*/
2 
3 /*++
4 /* NAME
5 /*	tok822_parse 3
6 /* SUMMARY
7 /*	RFC 822 address parser
8 /* SYNOPSIS
9 /*	#include <tok822.h>
10 /*
11 /*	TOK822 *tok822_scan_limit(str, tailp, limit)
12 /*	const char *str;
13 /*	TOK822	**tailp;
14 /*	int	limit;
15 /*
16 /*	TOK822 *tok822_scan(str, tailp)
17 /*	const char *str;
18 /*	TOK822	**tailp;
19 /*
20 /*	TOK822	*tok822_parse_limit(str, limit)
21 /*	const char *str;
22 /*	int	limit;
23 /*
24 /*	TOK822	*tok822_parse(str)
25 /*	const char *str;
26 /*
27 /*	TOK822	*tok822_scan_addr(str)
28 /*	const char *str;
29 /*
30 /*	VSTRING	*tok822_externalize(buffer, tree, flags)
31 /*	VSTRING	*buffer;
32 /*	TOK822	*tree;
33 /*	int	flags;
34 /*
35 /*	VSTRING	*tok822_internalize(buffer, tree, flags)
36 /*	VSTRING	*buffer;
37 /*	TOK822	*tree;
38 /*	int	flags;
39 /* DESCRIPTION
40 /*	This module converts address lists between string form and parse
41 /*	tree formats. The string form can appear in two different ways:
42 /*	external (or quoted) form, as used in message headers, and internal
43 /*	(unquoted) form, as used internally by the mail software.
44 /*	Although RFC 822 expects 7-bit data, these routines pay no
45 /*	special attention to 8-bit characters.
46 /*
47 /*	tok822_scan() converts the external-form string in \fIstr\fR
48 /*	to a linear token list. The \fItailp\fR argument is a null pointer
49 /*	or receives the pointer value of the last result list element.
50 /*
51 /*	tok822_scan_limit() implements tok822_scan(), which is a macro.
52 /*	The \fIlimit\fR argument is either zero or an upper bound on the
53 /*	number of tokens produced.
54 /*
55 /*	tok822_parse() converts the external-form address list in
56 /*	\fIstr\fR to the corresponding token tree. The parser is permissive
57 /*	and will not throw away information that it does not understand.
58 /*	The parser adds missing commas between addresses.
59 /*
60 /*	tok822_parse_limit() implements tok822_parse(), which is a macro.
61 /*	The \fIlimit\fR argument is either zero or an upper bound on the
62 /*	number of tokens produced.
63 /*
64 /*	tok822_scan_addr() converts the external-form string in
65 /*	\fIstr\fR to an address token tree. This is just string to
66 /*	token list conversion; no parsing is done. This routine is
67 /*	suitable for data that should contain just one address and no
68 /*	other information.
69 /*
70 /*	tok822_externalize() converts a token list to external form.
71 /*	Where appropriate, characters and strings are quoted and white
72 /*	space is inserted. The \fIflags\fR argument is the binary OR of
73 /*	zero or more of the following:
74 /* .IP TOK822_STR_WIPE
75 /*	Initially, truncate the result to zero length.
76 /* .IP TOK822_STR_TERM
77 /*	Append a null terminator to the result when done.
78 /* .IP TOK822_STR_LINE
79 /*	Append a line break after each comma token, instead of appending
80 /*	whitespace.  It is up to the caller to concatenate short lines to
81 /*	produce longer ones.
82 /* .IP TOK822_STR_TRNC
83 /*	Truncate non-address information to 250 characters per address, to
84 /*	protect Sendmail systems that are vulnerable to the problem in CERT
85 /*	advisory CA-2003-07.
86 /*	This flag has effect with tok822_externalize() only.
87 /* .PP
88 /*	The macro TOK822_STR_NONE expresses that none of the above features
89 /*	should be activated.
90 /*
91 /*	The macro TOK822_STR_DEFL combines the TOK822_STR_WIPE and
92 /*	TOK822_STR_TERM flags. This is useful for most token to string
93 /*	conversions.
94 /*
95 /*	The macro TOK822_STR_HEAD combines the TOK822_STR_TERM,
96 /*	TOK822_STR_LINE and TOK822_STR_TRNC flags. This is useful for
97 /*	the special case of token to mail header conversion.
98 /*
99 /*	tok822_internalize() converts a token list to string form,
100 /*	without quoting. White space is inserted where appropriate.
101 /*	The \fIflags\fR argument is as with tok822_externalize().
102 /* STANDARDS
103 /* .ad
104 /* .fi
105 /*	RFC 822 (ARPA Internet Text Messages). In addition to this standard
106 /*	this module implements additional operators such as % and !. These
107 /*	are needed because the real world is not all RFC 822. Also, the ':'
108 /*	operator is allowed to appear inside addresses, to accommodate DECnet.
109 /*	In addition, 8-bit data is not given special treatment.
110 /* LICENSE
111 /* .ad
112 /* .fi
113 /*	The Secure Mailer license must be distributed with this software.
114 /* AUTHOR(S)
115 /*	Wietse Venema
116 /*	IBM T.J. Watson Research
117 /*	P.O. Box 704
118 /*	Yorktown Heights, NY 10598, USA
119 /*--*/
120 
121 /* System library. */
122 
123 #include <sys_defs.h>
124 #include <ctype.h>
125 #include <string.h>
126 
127 /* Utility library. */
128 
129 #include <vstring.h>
130 #include <msg.h>
131 #include <stringops.h>
132 
133 /* Global library. */
134 
135 #include "lex_822.h"
136 #include "quote_822_local.h"
137 #include "tok822.h"
138 
139  /*
140   * I suppose this is my favorite macro. Used heavily for tokenizing.
141   */
142 #define COLLECT(t,s,c,cond) { \
143 	while ((c = *(unsigned char *) s) != 0) { \
144 	    if (c == '\\') { \
145 		if ((c = *(unsigned char *)++s) == 0) \
146 		    break; \
147 	    } else if (!(cond)) { \
148 		break; \
149 	    } \
150 	    VSTRING_ADDCH(t->vstr, IS_SPACE_TAB_CR_LF(c) ? ' ' : c); \
151 	    s++; \
152 	} \
153 	VSTRING_TERMINATE(t->vstr); \
154     }
155 
156 #define COLLECT_SKIP_LAST(t,s,c,cond) { COLLECT(t,s,c,cond); if (*s) s++; }
157 
158  /*
159   * Not quite as complex. The parser depends heavily on it.
160   */
161 #define SKIP(tp, cond) { \
162 	while (tp->type && (cond)) \
163 	    tp = tp->prev; \
164     }
165 
166 #define MOVE_COMMENT_AND_CONTINUE(tp, right) { \
167 	TOK822 *prev = tok822_unlink(tp); \
168 	right = tok822_prepend(right, tp); \
169 	tp = prev; \
170 	continue; \
171     }
172 
173 #define SKIP_MOVE_COMMENT(tp, cond, right) { \
174 	while (tp->type && (cond)) { \
175 	    if (tp->type == TOK822_COMMENT) \
176 		MOVE_COMMENT_AND_CONTINUE(tp, right); \
177 	    tp = tp->prev; \
178 	} \
179     }
180 
181  /*
182   * Single-character operators. We include the % and ! operators because not
183   * all the world is RFC822. XXX Make this operator list configurable when we
184   * have a real rewriting language. Include | for aliases file parsing.
185   */
186 static char tok822_opchar[] = "|%!" LEX_822_SPECIALS;
187 static void tok822_quote_atom(TOK822 *);
188 static const char *tok822_comment(TOK822 *, const char *);
189 static TOK822 *tok822_group(int, TOK822 *, TOK822 *, int);
190 static void tok822_copy_quoted(VSTRING *, char *, char *);
191 static int tok822_append_space(TOK822 *);
192 
193 #define DO_WORD		(1<<0)		/* finding a word is ok here */
194 #define DO_GROUP	(1<<1)		/* doing an address group */
195 
196 #define ADD_COMMA	','		/* resynchronize */
197 #define NO_MISSING_COMMA 0
198 
199 /* tok822_internalize - token tree to string, internal form */
200 
tok822_internalize(VSTRING * vp,TOK822 * tree,int flags)201 VSTRING *tok822_internalize(VSTRING *vp, TOK822 *tree, int flags)
202 {
203     TOK822 *tp;
204 
205     if (flags & TOK822_STR_WIPE)
206 	VSTRING_RESET(vp);
207 
208     for (tp = tree; tp; tp = tp->next) {
209 	switch (tp->type) {
210 	case ',':
211 	    VSTRING_ADDCH(vp, tp->type);
212 	    if (flags & TOK822_STR_LINE) {
213 		VSTRING_ADDCH(vp, '\n');
214 		continue;
215 	    }
216 	    break;
217 	case TOK822_ADDR:
218 	    tok822_internalize(vp, tp->head, TOK822_STR_NONE);
219 	    break;
220 	case TOK822_COMMENT:
221 	case TOK822_ATOM:
222 	case TOK822_QSTRING:
223 	    vstring_strcat(vp, vstring_str(tp->vstr));
224 	    break;
225 	case TOK822_DOMLIT:
226 	    VSTRING_ADDCH(vp, '[');
227 	    vstring_strcat(vp, vstring_str(tp->vstr));
228 	    VSTRING_ADDCH(vp, ']');
229 	    break;
230 	case TOK822_STARTGRP:
231 	    VSTRING_ADDCH(vp, ':');
232 	    break;
233 	default:
234 	    if (tp->type >= TOK822_MINTOK)
235 		msg_panic("tok822_internalize: unknown operator %d", tp->type);
236 	    VSTRING_ADDCH(vp, tp->type);
237 	}
238 	if (tok822_append_space(tp))
239 	    VSTRING_ADDCH(vp, ' ');
240     }
241     if (flags & TOK822_STR_TERM)
242 	VSTRING_TERMINATE(vp);
243     return (vp);
244 }
245 
246 /* strip_address - strip non-address text from address expression */
247 
strip_address(VSTRING * vp,ssize_t start,TOK822 * addr)248 static void strip_address(VSTRING *vp, ssize_t start, TOK822 *addr)
249 {
250     VSTRING *tmp;
251 
252     /*
253      * Emit plain <address>. Discard any comments or phrases.
254      */
255     VSTRING_TERMINATE(vp);
256     msg_warn("stripping too many comments from address: %.100s...",
257 	     printable(vstring_str(vp) + start, '?'));
258     vstring_truncate(vp, start);
259     VSTRING_ADDCH(vp, '<');
260     if (addr) {
261 	tmp = vstring_alloc(100);
262 	tok822_internalize(tmp, addr, TOK822_STR_TERM);
263 	quote_822_local_flags(vp, vstring_str(tmp),
264 			      QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
265 	vstring_free(tmp);
266     }
267     VSTRING_ADDCH(vp, '>');
268 }
269 
270 /* tok822_externalize - token tree to string, external form */
271 
tok822_externalize(VSTRING * vp,TOK822 * tree,int flags)272 VSTRING *tok822_externalize(VSTRING *vp, TOK822 *tree, int flags)
273 {
274     VSTRING *tmp;
275     TOK822 *tp;
276     ssize_t start;
277     TOK822 *addr;
278     ssize_t addr_len;
279 
280     /*
281      * Guard against a Sendmail buffer overflow (CERT advisory CA-2003-07).
282      * The problem was that Sendmail could store too much non-address text
283      * (comments, phrases, etc.) into a static 256-byte buffer.
284      *
285      * When the buffer fills up, fixed Sendmail versions remove comments etc.
286      * and reduce the information to just <$g>, which expands to <address>.
287      * No change is made when an address expression (text separated by
288      * commas) contains no address. This fix reportedly also protects
289      * Sendmail systems that are still vulnerable to this problem.
290      *
291      * Postfix takes the same approach, grudgingly. To avoid unnecessary damage,
292      * Postfix removes comments etc. only when the amount of non-address text
293      * in an address expression (text separated by commas) exceeds 250 bytes.
294      *
295      * With Sendmail, the address part of an address expression is the
296      * right-most <> instance in that expression. If an address expression
297      * contains no <>, then Postfix guarantees that it contains at most one
298      * non-comment string; that string is the address part of the address
299      * expression, so there is no ambiguity.
300      *
301      * Finally, we note that stress testing shows that other code in Sendmail
302      * 8.12.8 bluntly truncates ``text <address>'' to 256 bytes even when
303      * this means chopping the <address> somewhere in the middle. This is a
304      * loss of control that we're not entirely comfortable with. However,
305      * unbalanced quotes and dangling backslash do not seem to influence the
306      * way that Sendmail parses headers, so this is not an urgent problem.
307      */
308 #define MAX_NONADDR_LENGTH 250
309 
310 #define RESET_NONADDR_LENGTH { \
311 	start = VSTRING_LEN(vp); \
312 	addr = 0; \
313 	addr_len = 0; \
314     }
315 
316 #define ENFORCE_NONADDR_LENGTH do { \
317 	if (addr && VSTRING_LEN(vp) - addr_len > start + MAX_NONADDR_LENGTH) \
318 	    strip_address(vp, start, addr->head); \
319     } while(0)
320 
321     if (flags & TOK822_STR_WIPE)
322 	VSTRING_RESET(vp);
323 
324     if (flags & TOK822_STR_TRNC)
325 	RESET_NONADDR_LENGTH;
326 
327     for (tp = tree; tp; tp = tp->next) {
328 	switch (tp->type) {
329 	case ',':
330 	    if (flags & TOK822_STR_TRNC)
331 		ENFORCE_NONADDR_LENGTH;
332 	    VSTRING_ADDCH(vp, tp->type);
333 	    VSTRING_ADDCH(vp, (flags & TOK822_STR_LINE) ? '\n' : ' ');
334 	    if (flags & TOK822_STR_TRNC)
335 		RESET_NONADDR_LENGTH;
336 	    continue;
337 
338 	    /*
339 	     * XXX In order to correctly externalize an address, it is not
340 	     * sufficient to quote individual atoms. There are higher-level
341 	     * rules that say when an address localpart needs to be quoted.
342 	     * We wing it with the quote_822_local() routine, which ignores
343 	     * the issue of atoms in the domain part that would need quoting.
344 	     */
345 	case TOK822_ADDR:
346 	    addr = tp;
347 	    tmp = vstring_alloc(100);
348 	    tok822_internalize(tmp, tp->head, TOK822_STR_TERM);
349 	    addr_len = VSTRING_LEN(vp);
350 	    quote_822_local_flags(vp, vstring_str(tmp),
351 				  QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
352 	    addr_len = VSTRING_LEN(vp) - addr_len;
353 	    vstring_free(tmp);
354 	    break;
355 	case TOK822_ATOM:
356 	case TOK822_COMMENT:
357 	    vstring_strcat(vp, vstring_str(tp->vstr));
358 	    break;
359 	case TOK822_QSTRING:
360 	    VSTRING_ADDCH(vp, '"');
361 	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\"\\\r\n");
362 	    VSTRING_ADDCH(vp, '"');
363 	    break;
364 	case TOK822_DOMLIT:
365 	    VSTRING_ADDCH(vp, '[');
366 	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\\\r\n");
367 	    VSTRING_ADDCH(vp, ']');
368 	    break;
369 	case TOK822_STARTGRP:
370 	    VSTRING_ADDCH(vp, ':');
371 	    break;
372 	case '<':
373 	    if (tp->next && tp->next->type == '>') {
374 		addr = tp;
375 		addr_len = 0;
376 	    }
377 	    VSTRING_ADDCH(vp, '<');
378 	    break;
379 	default:
380 	    if (tp->type >= TOK822_MINTOK)
381 		msg_panic("tok822_externalize: unknown operator %d", tp->type);
382 	    VSTRING_ADDCH(vp, tp->type);
383 	}
384 	if (tok822_append_space(tp))
385 	    VSTRING_ADDCH(vp, ' ');
386     }
387     if (flags & TOK822_STR_TRNC)
388 	ENFORCE_NONADDR_LENGTH;
389 
390     if (flags & TOK822_STR_TERM)
391 	VSTRING_TERMINATE(vp);
392     return (vp);
393 }
394 
395 /* tok822_copy_quoted - copy a string while quoting */
396 
tok822_copy_quoted(VSTRING * vp,char * str,char * quote_set)397 static void tok822_copy_quoted(VSTRING *vp, char *str, char *quote_set)
398 {
399     int     ch;
400 
401     while ((ch = *(unsigned char *) str++) != 0) {
402 	if (strchr(quote_set, ch))
403 	    VSTRING_ADDCH(vp, '\\');
404 	VSTRING_ADDCH(vp, ch);
405     }
406 }
407 
408 /* tok822_append_space - see if space is needed after this token */
409 
tok822_append_space(TOK822 * tp)410 static int tok822_append_space(TOK822 *tp)
411 {
412     TOK822 *next;
413 
414     if (tp == 0 || (next = tp->next) == 0 || tp->owner != 0)
415 	return (0);
416     if (tp->type == ',' || tp->type == TOK822_STARTGRP || next->type == '<')
417 	return (1);
418 
419 #define NON_OPERATOR(x) \
420     (x->type == TOK822_ATOM || x->type == TOK822_QSTRING \
421      || x->type == TOK822_COMMENT || x->type == TOK822_DOMLIT \
422      || x->type == TOK822_ADDR)
423 
424     return (NON_OPERATOR(tp) && NON_OPERATOR(next));
425 }
426 
427 /* tok822_scan_limit - tokenize string */
428 
tok822_scan_limit(const char * str,TOK822 ** tailp,int tok_count_limit)429 TOK822 *tok822_scan_limit(const char *str, TOK822 **tailp, int tok_count_limit)
430 {
431     TOK822 *head = 0;
432     TOK822 *tail = 0;
433     TOK822 *tp;
434     int     ch;
435     int     tok_count = 0;
436 
437     /*
438      * XXX 2822 new feature: Section 4.1 allows "." to appear in a phrase (to
439      * allow for forms such as: Johnny B. Goode <johhny@domain.org>. I cannot
440      * handle that at the tokenizer level - it is not context sensitive. And
441      * to fix this at the parser level requires radical changes to preserve
442      * white space as part of the token stream. Thanks a lot, people.
443      */
444     while ((ch = *(unsigned char *) str++) != 0) {
445 	if (IS_SPACE_TAB_CR_LF(ch))
446 	    continue;
447 	if (ch == '(') {
448 	    tp = tok822_alloc(TOK822_COMMENT, (char *) 0);
449 	    str = tok822_comment(tp, str);
450 	} else if (ch == '[') {
451 	    tp = tok822_alloc(TOK822_DOMLIT, (char *) 0);
452 	    COLLECT_SKIP_LAST(tp, str, ch, ch != ']');
453 	} else if (ch == '"') {
454 	    tp = tok822_alloc(TOK822_QSTRING, (char *) 0);
455 	    COLLECT_SKIP_LAST(tp, str, ch, ch != '"');
456 	} else if (ch != '\\' && strchr(tok822_opchar, ch)) {
457 	    tp = tok822_alloc(ch, (char *) 0);
458 	} else {
459 	    tp = tok822_alloc(TOK822_ATOM, (char *) 0);
460 	    str -= 1;				/* \ may be first */
461 	    COLLECT(tp, str, ch, !IS_SPACE_TAB_CR_LF(ch) && !strchr(tok822_opchar, ch));
462 	    tok822_quote_atom(tp);
463 	}
464 	if (head == 0) {
465 	    head = tail = tp;
466 	    while (tail->next)
467 		tail = tail->next;
468 	} else {
469 	    tail = tok822_append(tail, tp);
470 	}
471 	if (tok_count_limit > 0 && ++tok_count >= tok_count_limit)
472 	    break;
473     }
474     if (tailp)
475 	*tailp = tail;
476     return (head);
477 }
478 
479 /* tok822_parse_limit - translate external string to token tree */
480 
tok822_parse_limit(const char * str,int tok_count_limit)481 TOK822 *tok822_parse_limit(const char *str, int tok_count_limit)
482 {
483     TOK822 *head;
484     TOK822 *tail;
485     TOK822 *right;
486     TOK822 *first_token;
487     TOK822 *last_token;
488     TOK822 *tp;
489     int     state;
490 
491     /*
492      * First, tokenize the string, from left to right. We are not allowed to
493      * throw away any information that we do not understand. With a flat
494      * token list that contains all tokens, we can always convert back to
495      * string form.
496      */
497     if ((first_token = tok822_scan_limit(str, &last_token, tok_count_limit)) == 0)
498 	return (0);
499 
500     /*
501      * For convenience, sandwich the token list between two sentinel tokens.
502      */
503 #define GLUE(left,rite) { left->next = rite; rite->prev = left; }
504 
505     head = tok822_alloc(0, (char *) 0);
506     GLUE(head, first_token);
507     tail = tok822_alloc(0, (char *) 0);
508     GLUE(last_token, tail);
509 
510     /*
511      * Next step is to transform the token list into a parse tree. This is
512      * done most conveniently from right to left. If there is something that
513      * we do not understand, just leave it alone, don't throw it away. The
514      * address information that we're looking for sits in-between the current
515      * node (tp) and the one called right. Add missing commas on the fly.
516      */
517     state = DO_WORD;
518     right = tail;
519     tp = tail->prev;
520     while (tp->type) {
521 	if (tp->type == TOK822_COMMENT) {	/* move comment to the side */
522 	    MOVE_COMMENT_AND_CONTINUE(tp, right);
523 	} else if (tp->type == ';') {		/* rh side of named group */
524 	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
525 	    state = DO_GROUP | DO_WORD;
526 	} else if (tp->type == ':' && (state & DO_GROUP) != 0) {
527 	    tp->type = TOK822_STARTGRP;
528 	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
529 	    SKIP(tp, tp->type != ',');
530 	    right = tp;
531 	    continue;
532 	} else if (tp->type == '>') {		/* rh side of <route> */
533 	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
534 	    SKIP_MOVE_COMMENT(tp, tp->type != '<', right);
535 	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
536 	    SKIP(tp, tp->type > 0xff || strchr(">;,:", tp->type) == 0);
537 	    right = tp;
538 	    state |= DO_WORD;
539 	    continue;
540 	} else if (tp->type == TOK822_ATOM || tp->type == TOK822_QSTRING
541 		   || tp->type == TOK822_DOMLIT) {
542 	    if ((state & DO_WORD) == 0)
543 		right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA)->next;
544 	    state &= ~DO_WORD;
545 	} else if (tp->type == ',') {
546 	    right = tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
547 	    state |= DO_WORD;
548 	} else {
549 	    state |= DO_WORD;
550 	}
551 	tp = tp->prev;
552     }
553     (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
554 
555     /*
556      * Discard the sentinel tokens on the left and right extremes. Properly
557      * terminate the resulting list.
558      */
559     tp = (head->next != tail ? head->next : 0);
560     tok822_cut_before(head->next);
561     tok822_free(head);
562     tok822_cut_before(tail);
563     tok822_free(tail);
564     return (tp);
565 }
566 
567 /* tok822_quote_atom - see if an atom needs quoting when externalized */
568 
tok822_quote_atom(TOK822 * tp)569 static void tok822_quote_atom(TOK822 *tp)
570 {
571     char   *cp;
572     int     ch;
573 
574     /*
575      * RFC 822 expects 7-bit data. Rather than quoting every 8-bit character
576      * (and still passing it on as 8-bit data) we leave 8-bit data alone.
577      */
578     for (cp = vstring_str(tp->vstr); (ch = *(unsigned char *) cp) != 0; cp++) {
579 	if ( /* !ISASCII(ch) || */ ch == ' '
580 	    || ISCNTRL(ch) || strchr(tok822_opchar, ch)) {
581 	    tp->type = TOK822_QSTRING;
582 	    break;
583 	}
584     }
585 }
586 
587 /* tok822_comment - tokenize comment */
588 
tok822_comment(TOK822 * tp,const char * str)589 static const char *tok822_comment(TOK822 *tp, const char *str)
590 {
591     int     level = 1;
592     int     ch;
593 
594     /*
595      * XXX We cheat by storing comments in their external form. Otherwise it
596      * would be a royal pain to preserve \ before (. That would require a
597      * recursive parser; the easy to implement stack-based recursion would be
598      * too expensive.
599      */
600     VSTRING_ADDCH(tp->vstr, '(');
601 
602     while ((ch = *(unsigned char *) str) != 0) {
603 	VSTRING_ADDCH(tp->vstr, ch);
604 	str++;
605 	if (ch == '(') {			/* comments can nest! */
606 	    level++;
607 	} else if (ch == ')') {
608 	    if (--level == 0)
609 		break;
610 	} else if (ch == '\\') {
611 	    if ((ch = *(unsigned char *) str) == 0)
612 		break;
613 	    VSTRING_ADDCH(tp->vstr, ch);
614 	    str++;
615 	}
616     }
617     VSTRING_TERMINATE(tp->vstr);
618     return (str);
619 }
620 
621 /* tok822_group - cluster a group of tokens */
622 
tok822_group(int group_type,TOK822 * left,TOK822 * right,int sync_type)623 static TOK822 *tok822_group(int group_type, TOK822 *left, TOK822 *right, int sync_type)
624 {
625     TOK822 *group;
626     TOK822 *sync;
627     TOK822 *first;
628 
629     /*
630      * Cluster the tokens between left and right under their own parse tree
631      * node. Optionally insert a resync token.
632      */
633     if (left != right && (first = left->next) != right) {
634 	tok822_cut_before(right);
635 	tok822_cut_before(first);
636 	group = tok822_alloc(group_type, (char *) 0);
637 	tok822_sub_append(group, first);
638 	tok822_append(left, group);
639 	tok822_append(group, right);
640 	if (sync_type) {
641 	    sync = tok822_alloc(sync_type, (char *) 0);
642 	    tok822_append(left, sync);
643 	}
644     }
645     return (left);
646 }
647 
648 /* tok822_scan_addr - convert external address string to address token */
649 
tok822_scan_addr(const char * addr)650 TOK822 *tok822_scan_addr(const char *addr)
651 {
652     TOK822 *tree = tok822_alloc(TOK822_ADDR, (char *) 0);
653 
654     tree->head = tok822_scan(addr, &tree->tail);
655     return (tree);
656 }
657 
658 #ifdef TEST
659 
660 #include <unistd.h>
661 #include <vstream.h>
662 #include <readlline.h>
663 
664 /* tok822_print - display token */
665 
tok822_print(TOK822 * list,int indent)666 static void tok822_print(TOK822 *list, int indent)
667 {
668     TOK822 *tp;
669 
670     for (tp = list; tp; tp = tp->next) {
671 	if (tp->type < TOK822_MINTOK) {
672 	    vstream_printf("%*s %s \"%c\"\n", indent, "", "OP", tp->type);
673 	} else if (tp->type == TOK822_ADDR) {
674 	    vstream_printf("%*s %s\n", indent, "", "address");
675 	    tok822_print(tp->head, indent + 2);
676 	} else if (tp->type == TOK822_STARTGRP) {
677 	    vstream_printf("%*s %s\n", indent, "", "group \":\"");
678 	} else {
679 	    vstream_printf("%*s %s \"%s\"\n", indent, "",
680 			   tp->type == TOK822_COMMENT ? "comment" :
681 			   tp->type == TOK822_ATOM ? "atom" :
682 			   tp->type == TOK822_QSTRING ? "quoted string" :
683 			   tp->type == TOK822_DOMLIT ? "domain literal" :
684 			   tp->type == TOK822_ADDR ? "address" :
685 			   "unknown\n", vstring_str(tp->vstr));
686 	}
687     }
688 }
689 
main(int unused_argc,char ** unused_argv)690 int     main(int unused_argc, char **unused_argv)
691 {
692     VSTRING *vp = vstring_alloc(100);
693     TOK822 *list;
694     VSTRING *buf = vstring_alloc(100);
695 
696 #define TEST_TOKEN_LIMIT 20
697 
698     while (readlline(buf, VSTREAM_IN, (int *) 0)) {
699 	while (VSTRING_LEN(buf) > 0 && vstring_end(buf)[-1] == '\n') {
700 	    vstring_end(buf)[-1] = 0;
701 	    vstring_truncate(buf, VSTRING_LEN(buf) - 1);
702 	}
703 	if (!isatty(vstream_fileno(VSTREAM_IN)))
704 	    vstream_printf(">>>%s<<<\n\n", vstring_str(buf));
705 	list = tok822_parse_limit(vstring_str(buf), TEST_TOKEN_LIMIT);
706 	vstream_printf("Parse tree:\n");
707 	tok822_print(list, 0);
708 	vstream_printf("\n");
709 
710 	vstream_printf("Internalized:\n%s\n\n",
711 		vstring_str(tok822_internalize(vp, list, TOK822_STR_DEFL)));
712 	vstream_fflush(VSTREAM_OUT);
713 	vstream_printf("Externalized, no newlines inserted:\n%s\n\n",
714 		       vstring_str(tok822_externalize(vp, list,
715 				       TOK822_STR_DEFL | TOK822_STR_TRNC)));
716 	vstream_fflush(VSTREAM_OUT);
717 	vstream_printf("Externalized, newlines inserted:\n%s\n\n",
718 		       vstring_str(tok822_externalize(vp, list,
719 		     TOK822_STR_DEFL | TOK822_STR_LINE | TOK822_STR_TRNC)));
720 	vstream_fflush(VSTREAM_OUT);
721 	tok822_free_tree(list);
722     }
723     vstring_free(vp);
724     vstring_free(buf);
725     return (0);
726 }
727 
728 #endif
729