xref: /plan9/sys/src/cmd/cpp/lex.c (revision 2a7824990d644563b93ed8d4abf1407c40b2087a)
1 #include <u.h>
2 #include <libc.h>
3 #include <stdio.h>
4 #include "cpp.h"
5 
6 /*
7  * lexical FSM encoding
8  *   when in state state, and one of the characters
9  *   in ch arrives, enter nextstate.
10  *   States >= S_SELF are either final, or at least require special action.
11  *   In 'fsm' there is a line for each state X charset X nextstate.
12  *   List chars that overwrite previous entries later (e.g. C_ALPH
13  *   can be overridden by '_' by a later entry; and C_XX is the
14  *   the universal set, and should always be first.
15  *   States above S_SELF are represented in the big table as negative values.
16  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
17  *   These actions differ in that S_SELF doesn't have a lookahead char,
18  *   S_SELFB does.
19  *
20  *   The encoding is blown out into a big table for time-efficiency.
21  *   Entries have
22  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
23  */
24 
25 #define	MAXSTATE 32
26 #define	ACT(tok,act)	((tok<<7)+act)
27 #define	QBSBIT	0100
28 #define	GETACT(st)	(st>>7)&0x1ff
29 
30 #define	UTF2(c)		((c)>=0xA0 && (c)<0xE0)		/* 2-char UTF seq */
31 #define	UTF3(c)		((c)>=0xE0 && (c)<0xF0)		/* 3-char UTF seq */
32 
33 /* character classes */
34 #define	C_WS	1
35 #define	C_ALPH	2
36 #define	C_NUM	3
37 #define	C_EOF	4
38 #define	C_XX	5
39 
40 enum state {
41 	START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
42 	CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
43 	CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
44 	S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
45 	S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
46 };
47 
48 struct	fsm {
49 	int	state;		/* if in this state */
50 	uchar	ch[4];		/* and see one of these characters */
51 	int	nextstate;	/* enter this state if +ve */
52 };
53 
54 /*const*/ struct fsm fsm[] = {
55 	/* start state */
56 	START,	{ C_XX },	ACT(UNCLASS,S_SELF),
57 	START,	{ ' ', '\t', '\v', '\r' },	WS1,
58 	START,	{ C_NUM },	NUM1,
59 	START,	{ '.' },	NUM3,
60 	START,	{ C_ALPH },	ID1,
61 	START,	{ 'L' },	ST1,
62 	START,	{ '"' },	ST2,
63 	START,	{ '\'' },	CC1,
64 	START,	{ '/' },	COM1,
65 	START,	{ EOFC },	S_EOF,
66 	START,	{ '\n' },	S_NL,
67 	START,	{ '-' },	MINUS1,
68 	START,	{ '+' },	PLUS1,
69 	START,	{ '<' },	LT1,
70 	START,	{ '>' },	GT1,
71 	START,	{ '=' },	ASG1,
72 	START,	{ '!' },	NOT1,
73 	START,	{ '&' },	AND1,
74 	START,	{ '|' },	OR1,
75 	START,	{ '#' },	SHARP1,
76 	START,	{ '%' },	PCT1,
77 	START,	{ '[' },	ACT(SBRA,S_SELF),
78 	START,	{ ']' },	ACT(SKET,S_SELF),
79 	START,	{ '(' },	ACT(LP,S_SELF),
80 	START,	{ ')' },	ACT(RP,S_SELF),
81 	START,	{ '*' },	STAR1,
82 	START,	{ ',' },	ACT(COMMA,S_SELF),
83 	START,	{ '?' },	ACT(QUEST,S_SELF),
84 	START,	{ ':' },	ACT(COLON,S_SELF),
85 	START,	{ ';' },	ACT(SEMIC,S_SELF),
86 	START,	{ '{' },	ACT(CBRA,S_SELF),
87 	START,	{ '}' },	ACT(CKET,S_SELF),
88 	START,	{ '~' },	ACT(TILDE,S_SELF),
89 	START,	{ '^' },	CIRC1,
90 
91 	/* saw a digit */
92 	NUM1,	{ C_XX },	ACT(NUMBER,S_SELFB),
93 	NUM1,	{ C_NUM, C_ALPH, '.' },	NUM1,
94 	NUM1,	{ 'E', 'e' },	NUM2,
95 	NUM1,	{ '_' },	ACT(NUMBER,S_SELFB),
96 
97 	/* saw possible start of exponent, digits-e */
98 	NUM2,	{ C_XX },	ACT(NUMBER,S_SELFB),
99 	NUM2,	{ '+', '-' },	NUM1,
100 	NUM2,	{ C_NUM, C_ALPH },	NUM1,
101 	NUM2,	{ '_' },	ACT(NUMBER,S_SELFB),
102 
103 	/* saw a '.', which could be a number or an operator */
104 	NUM3,	{ C_XX },	ACT(DOT,S_SELFB),
105 	NUM3,	{ '.' },	DOTS1,
106 	NUM3,	{ C_NUM },	NUM1,
107 
108 	DOTS1,	{ C_XX },	ACT(UNCLASS, S_SELFB),
109 	DOTS1,	{ C_NUM },	NUM1,
110 	DOTS1,	{ '.' },	ACT(ELLIPS, S_SELF),
111 
112 	/* saw a letter or _ */
113 	ID1,	{ C_XX },	ACT(NAME,S_NAME),
114 	ID1,	{ C_ALPH, C_NUM },	ID1,
115 
116 	/* saw L (start of wide string?) */
117 	ST1,	{ C_XX },	ACT(NAME,S_NAME),
118 	ST1,	{ C_ALPH, C_NUM },	ID1,
119 	ST1,	{ '"' },	ST2,
120 	ST1,	{ '\'' },	CC1,
121 
122 	/* saw " beginning string */
123 	ST2,	{ C_XX },	ST2,
124 	ST2,	{ '"' },	ACT(STRING, S_SELF),
125 	ST2,	{ '\\' },	ST3,
126 	ST2,	{ '\n' },	S_STNL,
127 	ST2,	{ EOFC },	S_EOFSTR,
128 
129 	/* saw \ in string */
130 	ST3,	{ C_XX },	ST2,
131 	ST3,	{ '\n' },	S_STNL,
132 	ST3,	{ EOFC },	S_EOFSTR,
133 
134 	/* saw ' beginning character const */
135 	CC1,	{ C_XX },	CC1,
136 	CC1,	{ '\'' },	ACT(CCON, S_SELF),
137 	CC1,	{ '\\' },	CC2,
138 	CC1,	{ '\n' },	S_STNL,
139 	CC1,	{ EOFC },	S_EOFSTR,
140 
141 	/* saw \ in ccon */
142 	CC2,	{ C_XX },	CC1,
143 	CC2,	{ '\n' },	S_STNL,
144 	CC2,	{ EOFC },	S_EOFSTR,
145 
146 	/* saw /, perhaps start of comment */
147 	COM1,	{ C_XX },	ACT(SLASH, S_SELFB),
148 	COM1,	{ '=' },	ACT(ASSLASH, S_SELF),
149 	COM1,	{ '*' },	COM2,
150 	COM1,	{ '/' },	COM4,
151 
152 	/* saw "/*", start of comment */
153 	COM2,	{ C_XX },	COM2,
154 	COM2,	{ '\n' },	S_COMNL,
155 	COM2,	{ '*' },	COM3,
156 	COM2,	{ EOFC },	S_EOFCOM,
157 
158 	/* saw the * possibly ending a comment */
159 	COM3,	{ C_XX },	COM2,
160 	COM3,	{ '\n' },	S_COMNL,
161 	COM3,	{ '*' },	COM3,
162 	COM3,	{ '/' },	S_COMMENT,
163 	COM3,	{ EOFC },	S_EOFCOM,
164 
165 	/* // comment */
166 	COM4,	{ C_XX },	COM4,
167 	COM4,	{ '\n' },	S_NL,
168 	COM4,	{ EOFC },	S_EOFCOM,
169 
170 	/* saw white space, eat it up */
171 	WS1,	{ C_XX },	S_WS,
172 	WS1,	{ ' ', '\t', '\v', '\r'},	WS1,
173 
174 	/* saw -, check --, -=, -> */
175 	MINUS1,	{ C_XX },	ACT(MINUS, S_SELFB),
176 	MINUS1,	{ '-' },	ACT(MMINUS, S_SELF),
177 	MINUS1,	{ '=' },	ACT(ASMINUS,S_SELF),
178 	MINUS1,	{ '>' },	ACT(ARROW,S_SELF),
179 
180 	/* saw +, check ++, += */
181 	PLUS1,	{ C_XX },	ACT(PLUS, S_SELFB),
182 	PLUS1,	{ '+' },	ACT(PPLUS, S_SELF),
183 	PLUS1,	{ '=' },	ACT(ASPLUS, S_SELF),
184 
185 	/* saw <, check <<, <<=, <= */
186 	LT1,	{ C_XX },	ACT(LT, S_SELFB),
187 	LT1,	{ '<' },	LT2,
188 	LT1,	{ '=' },	ACT(LEQ, S_SELF),
189 	LT2,	{ C_XX },	ACT(LSH, S_SELFB),
190 	LT2,	{ '=' },	ACT(ASLSH, S_SELF),
191 
192 	/* saw >, check >>, >>=, >= */
193 	GT1,	{ C_XX },	ACT(GT, S_SELFB),
194 	GT1,	{ '>' },	GT2,
195 	GT1,	{ '=' },	ACT(GEQ, S_SELF),
196 	GT2,	{ C_XX },	ACT(RSH, S_SELFB),
197 	GT2,	{ '=' },	ACT(ASRSH, S_SELF),
198 
199 	/* = */
200 	ASG1,	{ C_XX },	ACT(ASGN, S_SELFB),
201 	ASG1,	{ '=' },	ACT(EQ, S_SELF),
202 
203 	/* ! */
204 	NOT1,	{ C_XX },	ACT(NOT, S_SELFB),
205 	NOT1,	{ '=' },	ACT(NEQ, S_SELF),
206 
207 	/* & */
208 	AND1,	{ C_XX },	ACT(AND, S_SELFB),
209 	AND1,	{ '&' },	ACT(LAND, S_SELF),
210 	AND1,	{ '=' },	ACT(ASAND, S_SELF),
211 
212 	/* | */
213 	OR1,	{ C_XX },	ACT(OR, S_SELFB),
214 	OR1,	{ '|' },	ACT(LOR, S_SELF),
215 	OR1,	{ '=' },	ACT(ASOR, S_SELF),
216 
217 	/* # */
218 	SHARP1,	{ C_XX },	ACT(SHARP, S_SELFB),
219 	SHARP1,	{ '#' },	ACT(DSHARP, S_SELF),
220 
221 	/* % */
222 	PCT1,	{ C_XX },	ACT(PCT, S_SELFB),
223 	PCT1,	{ '=' },	ACT(ASPCT, S_SELF),
224 
225 	/* * */
226 	STAR1,	{ C_XX },	ACT(STAR, S_SELFB),
227 	STAR1,	{ '=' },	ACT(ASSTAR, S_SELF),
228 
229 	/* ^ */
230 	CIRC1,	{ C_XX },	ACT(CIRC, S_SELFB),
231 	CIRC1,	{ '=' },	ACT(ASCIRC, S_SELF),
232 
233 	-1
234 };
235 
236 /* first index is char, second is state */
237 /* increase #states to power of 2 to encourage use of shift */
238 short	bigfsm[256][MAXSTATE];
239 
240 void
expandlex(void)241 expandlex(void)
242 {
243 	/*const*/ struct fsm *fp;
244 	int i, j, nstate;
245 
246 	for (fp = fsm; fp->state>=0; fp++) {
247 		for (i=0; fp->ch[i]; i++) {
248 			nstate = fp->nextstate;
249 			if (nstate >= S_SELF)
250 				nstate = ~nstate;
251 			switch (fp->ch[i]) {
252 
253 			case C_XX:		/* random characters */
254 				for (j=0; j<256; j++)
255 					bigfsm[j][fp->state] = nstate;
256 				continue;
257 			case C_ALPH:
258 				for (j=0; j<=256; j++)
259 					if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
260 					  || UTF2(j) || UTF3(j) || j=='_')
261 						bigfsm[j][fp->state] = nstate;
262 				continue;
263 			case C_NUM:
264 				for (j='0'; j<='9'; j++)
265 					bigfsm[j][fp->state] = nstate;
266 				continue;
267 			default:
268 				bigfsm[fp->ch[i]][fp->state] = nstate;
269 			}
270 		}
271 	}
272 	/* install special cases for ? (trigraphs),  \ (splicing), runes, and EOB */
273 	for (i=0; i<MAXSTATE; i++) {
274 		for (j=0; j<0xFF; j++)
275 			if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
276 				if (bigfsm[j][i]>0)
277 					bigfsm[j][i] = ~bigfsm[j][i];
278 				bigfsm[j][i] &= ~QBSBIT;
279 			}
280 		bigfsm[EOB][i] = ~S_EOB;
281 		if (bigfsm[EOFC][i]>=0)
282 			bigfsm[EOFC][i] = ~S_EOF;
283 	}
284 }
285 
286 void
fixlex(void)287 fixlex(void)
288 {
289 	/* do C++ comments? */
290 	if (Cplusplus==0)
291 		bigfsm['/'][COM1] = bigfsm['x'][COM1];
292 }
293 
294 /*
295  * fill in a row of tokens from input, terminated by NL or END
296  * First token is put at trp->lp.
297  * Reset is non-zero when the input buffer can be "rewound."
298  * The value is a flag indicating that possible macros have
299  * been seen in the row.
300  */
301 int
gettokens(Tokenrow * trp,int reset)302 gettokens(Tokenrow *trp, int reset)
303 {
304 	register int c, state, oldstate;
305 	register uchar *ip;
306 	register Token *tp, *maxp;
307 	int runelen;
308 	Source *s = cursource;
309 	int nmac = 0;
310 	extern char outbuf[];
311 
312 	tp = trp->lp;
313 	ip = s->inp;
314 	if (reset) {
315 		s->lineinc = 0;
316 		if (ip>=s->inl) {		/* nothing in buffer */
317 			s->inl = s->inb;
318 			fillbuf(s);
319 			ip = s->inp = s->inb;
320 		} else if (ip >= s->inb+(3*s->ins/4)) {
321 			memmove(s->inb, ip, 4+s->inl-ip);
322 			s->inl = s->inb+(s->inl-ip);
323 			ip = s->inp = s->inb;
324 		}
325 	}
326 	maxp = &trp->bp[trp->max];
327 	runelen = 1;
328 	for (;;) {
329 	   continue2:
330 		if (tp>=maxp) {
331 			trp->lp = tp;
332 			tp = growtokenrow(trp);
333 			maxp = &trp->bp[trp->max];
334 		}
335 		tp->type = UNCLASS;
336 		tp->hideset = 0;
337 		tp->t = ip;
338 		tp->wslen = 0;
339 		tp->flag = 0;
340 		state = START;
341 		for (;;) {
342 			oldstate = state;
343 			c = *ip;
344 			if ((state = bigfsm[c][state]) >= 0) {
345 				ip += runelen;
346 				runelen = 1;
347 				continue;
348 			}
349 			state = ~state;
350 		reswitch:
351 			switch (state&0177) {
352 			case S_SELF:
353 				ip += runelen;
354 				runelen = 1;
355 			case S_SELFB:
356 				tp->type = GETACT(state);
357 				tp->len = ip - tp->t;
358 				tp++;
359 				goto continue2;
360 
361 			case S_NAME:	/* like S_SELFB but with nmac check */
362 				tp->type = NAME;
363 				tp->len = ip - tp->t;
364 				nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
365 				tp++;
366 				goto continue2;
367 
368 			case S_WS:
369 				tp->wslen = ip - tp->t;
370 				tp->t = ip;
371 				state = START;
372 				continue;
373 
374 			default:
375 				if ((state&QBSBIT)==0) {
376 					ip += runelen;
377 					runelen = 1;
378 					continue;
379 				}
380 				state &= ~QBSBIT;
381 				s->inp = ip;
382 				if (c=='?') { 	/* check trigraph */
383 					if (trigraph(s)) {
384 						state = oldstate;
385 						continue;
386 					}
387 					goto reswitch;
388 				}
389 				if (c=='\\') { /* line-folding */
390 					if (foldline(s)) {
391 						s->lineinc++;
392 						state = oldstate;
393 						continue;
394 					}
395 					goto reswitch;
396 				}
397 				if (UTF2(c)) {
398 					runelen = 2;
399 					goto reswitch;
400 				}
401 				if (UTF3(c)) {
402 					runelen = 3;
403 					goto reswitch;
404 				}
405 				error(WARNING, "Lexical botch in cpp");
406 				ip += runelen;
407 				runelen = 1;
408 				continue;
409 
410 			case S_EOB:
411 				s->inp = ip;
412 				fillbuf(cursource);
413 				state = oldstate;
414 				continue;
415 
416 			case S_EOF:
417 				tp->type = END;
418 				tp->len = 0;
419 				s->inp = ip;
420 				if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
421 					error(WARNING,"No newline at end of file");
422 				trp->lp = tp+1;
423 				return nmac;
424 
425 			case S_STNL:
426 				error(ERROR, "Unterminated string or char const");
427 			case S_NL:
428 				tp->t = ip;
429 				tp->type = NL;
430 				tp->len = 1;
431 				tp->wslen = 0;
432 				s->lineinc++;
433 				s->inp = ip+1;
434 				trp->lp = tp+1;
435 				return nmac;
436 
437 			case S_EOFSTR:
438 				error(FATAL, "EOF in string or char constant");
439 				break;
440 
441 			case S_COMNL:
442 				s->lineinc++;
443 				state = COM2;
444 				ip += runelen;
445 				runelen = 1;
446  				if (ip >= s->inb+(7*s->ins/8)) { /* very long comment */
447 					memmove(tp->t, ip, 4+s->inl-ip);
448 					s->inl -= ip-tp->t;
449 					ip = tp->t+1;
450 				}
451 				continue;
452 
453 			case S_EOFCOM:
454 				error(WARNING, "EOF inside comment");
455 				--ip;
456 			case S_COMMENT:
457 				++ip;
458 				tp->t = ip;
459 				tp->t[-1] = ' ';
460 				tp->wslen = 1;
461 				state = START;
462 				continue;
463 			}
464 			break;
465 		}
466 		ip += runelen;
467 		runelen = 1;
468 		tp->len = ip - tp->t;
469 		tp++;
470 	}
471 }
472 
473 /* have seen ?; handle the trigraph it starts (if any) else 0 */
474 int
trigraph(Source * s)475 trigraph(Source *s)
476 {
477 	int c;
478 
479 	while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
480 		;
481 	if (s->inp[1]!='?')
482 		return 0;
483 	c = 0;
484 	switch(s->inp[2]) {
485 	case '=':
486 		c = '#'; break;
487 	case '(':
488 		c = '['; break;
489 	case '/':
490 		c = '\\'; break;
491 	case ')':
492 		c = ']'; break;
493 	case '\'':
494 		c = '^'; break;
495 	case '<':
496 		c = '{'; break;
497 	case '!':
498 		c = '|'; break;
499 	case '>':
500 		c = '}'; break;
501 	case '-':
502 		c = '~'; break;
503 	}
504 	if (c) {
505 		*s->inp = c;
506 		memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
507 		s->inl -= 2;
508 	}
509 	return c;
510 }
511 
512 int
foldline(Source * s)513 foldline(Source *s)
514 {
515 	int ncr = 0;
516 
517 recheck:
518 	while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
519 		;
520 	if (s->inp[ncr+1] == '\r') {	/* nonstandardly, ignore CR before line-folding */
521 		ncr++;
522 		goto recheck;
523 	}
524 	if (s->inp[ncr+1] == '\n') {
525 		memmove(s->inp, s->inp+2+ncr, s->inl-s->inp+3-ncr);
526 		s->inl -= 2+ncr;
527 		return 1;
528 	}
529 	return 0;
530 }
531 
532 int
fillbuf(Source * s)533 fillbuf(Source *s)
534 {
535 	int n;
536 
537 	while((char *)s->inl+s->ins/8 > (char *)s->inb+s->ins) {
538 		int l = s->inl - s->inb;
539 		int p = s->inp - s->inb;
540 		if(l < 0)
541 			error(FATAL, "negative end of input!?");
542 		if(p < 0)
543 			error(FATAL, "negative input pointer!?");
544 		/* double the buffer size and try again */
545 		s->ins *= 2;
546 		s->inb = dorealloc(s->inb, s->ins);
547 		s->inl = s->inb + l;
548 		s->inp = s->inb + p;
549 	}
550 	if (s->fd<0 || (n=read(s->fd, (char *)s->inl, s->ins/8)) <= 0)
551 		n = 0;
552 	if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
553 		*s->inp = EOFC;
554 	s->inl += n;
555 	s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
556 	if (n==0) {
557 		s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
558 		return EOF;
559 	}
560 	return 0;
561 }
562 
563 /*
564  * Push down to new source of characters.
565  * If fd>0 and str==NULL, then from a file `name';
566  * if fd==-1 and str, then from the string.
567  */
568 Source *
setsource(char * name,int fd,char * str)569 setsource(char *name, int fd, char *str)
570 {
571 	Source *s = new(Source);
572 	int len;
573 
574 	s->line = 1;
575 	s->lineinc = 0;
576 	s->fd = fd;
577 	s->filename = name;
578 	s->next = cursource;
579 	s->ifdepth = 0;
580 	cursource = s;
581 	/* slop at right for EOB */
582 	if (str) {
583 		len = strlen(str);
584 		s->inb = domalloc(len+4);
585 		s->inp = s->inb;
586 		strncpy((char *)s->inp, str, len);
587 	} else {
588 		Dir *d;
589 		int junk;
590 		ulong length = 0;
591 		d = dirfstat(fd);
592 		if (d != nil) {
593 			length = d->length;
594 			free(d);
595 		}
596 		junk = length;
597 		if (junk<INS)
598 			junk = INS;
599 		s->inb = domalloc((junk)+4);
600 		s->inp = s->inb;
601 		len = 0;
602 	}
603 
604 	s->ins = INS;
605 	s->inl = s->inp+len;
606 	s->inl[0] = s->inl[1] = EOB;
607 	return s;
608 }
609 
610 void
unsetsource(void)611 unsetsource(void)
612 {
613 	Source *s = cursource;
614 
615 	if (s->fd>=0) {
616 		close(s->fd);
617 		dofree(s->inb);
618 	}
619 	cursource = s->next;
620 	dofree(s);
621 }
622