1 #include <u.h> 2 #include <libc.h> 3 #include <stdio.h> 4 #include "cpp.h" 5 6 /* 7 * lexical FSM encoding 8 * when in state state, and one of the characters 9 * in ch arrives, enter nextstate. 10 * States >= S_SELF are either final, or at least require special action. 11 * In 'fsm' there is a line for each state X charset X nextstate. 12 * List chars that overwrite previous entries later (e.g. C_ALPH 13 * can be overridden by '_' by a later entry; and C_XX is the 14 * the universal set, and should always be first. 15 * States above S_SELF are represented in the big table as negative values. 16 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 17 * These actions differ in that S_SELF doesn't have a lookahead char, 18 * S_SELFB does. 19 * 20 * The encoding is blown out into a big table for time-efficiency. 21 * Entries have 22 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 23 */ 24 25 #define MAXSTATE 32 26 #define ACT(tok,act) ((tok<<7)+act) 27 #define QBSBIT 0100 28 #define GETACT(st) (st>>7)&0x1ff 29 30 #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */ 31 #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */ 32 33 /* character classes */ 34 #define C_WS 1 35 #define C_ALPH 2 36 #define C_NUM 3 37 #define C_EOF 4 38 #define C_XX 5 39 40 enum state { 41 START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 42 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 43 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 44 S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 45 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 46 }; 47 48 int tottok; 49 int tokkind[256]; 50 struct fsm { 51 int state; /* if in this state */ 52 uchar ch[4]; /* and see one of these characters */ 53 int nextstate; /* enter this state if +ve */ 54 }; 55 56 /*const*/ struct fsm fsm[] = { 57 /* start state */ 58 START, { C_XX }, ACT(UNCLASS,S_SELF), 59 START, { ' ', '\t', '\v', '\r' }, WS1, 60 START, { C_NUM }, NUM1, 61 START, { '.' }, NUM3, 62 START, { C_ALPH }, ID1, 63 START, { 'L' }, ST1, 64 START, { '"' }, ST2, 65 START, { '\'' }, CC1, 66 START, { '/' }, COM1, 67 START, { EOFC }, S_EOF, 68 START, { '\n' }, S_NL, 69 START, { '-' }, MINUS1, 70 START, { '+' }, PLUS1, 71 START, { '<' }, LT1, 72 START, { '>' }, GT1, 73 START, { '=' }, ASG1, 74 START, { '!' }, NOT1, 75 START, { '&' }, AND1, 76 START, { '|' }, OR1, 77 START, { '#' }, SHARP1, 78 START, { '%' }, PCT1, 79 START, { '[' }, ACT(SBRA,S_SELF), 80 START, { ']' }, ACT(SKET,S_SELF), 81 START, { '(' }, ACT(LP,S_SELF), 82 START, { ')' }, ACT(RP,S_SELF), 83 START, { '*' }, STAR1, 84 START, { ',' }, ACT(COMMA,S_SELF), 85 START, { '?' }, ACT(QUEST,S_SELF), 86 START, { ':' }, ACT(COLON,S_SELF), 87 START, { ';' }, ACT(SEMIC,S_SELF), 88 START, { '{' }, ACT(CBRA,S_SELF), 89 START, { '}' }, ACT(CKET,S_SELF), 90 START, { '~' }, ACT(TILDE,S_SELF), 91 START, { '^' }, CIRC1, 92 93 /* saw a digit */ 94 NUM1, { C_XX }, ACT(NUMBER,S_SELFB), 95 NUM1, { C_NUM, C_ALPH, '.' }, NUM1, 96 NUM1, { 'E', 'e' }, NUM2, 97 NUM1, { '_' }, ACT(NUMBER,S_SELFB), 98 99 /* saw possible start of exponent, digits-e */ 100 NUM2, { C_XX }, ACT(NUMBER,S_SELFB), 101 NUM2, { '+', '-' }, NUM1, 102 NUM2, { C_NUM, C_ALPH }, NUM1, 103 NUM2, { '_' }, ACT(NUMBER,S_SELFB), 104 105 /* saw a '.', which could be a number or an operator */ 106 NUM3, { C_XX }, ACT(DOT,S_SELFB), 107 NUM3, { '.' }, DOTS1, 108 NUM3, { C_NUM }, NUM1, 109 110 DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB), 111 DOTS1, { C_NUM }, NUM1, 112 DOTS1, { '.' }, ACT(ELLIPS, S_SELF), 113 114 /* saw a letter or _ */ 115 ID1, { C_XX }, ACT(NAME,S_NAME), 116 ID1, { C_ALPH, C_NUM }, ID1, 117 118 /* saw L (start of wide string?) */ 119 ST1, { C_XX }, ACT(NAME,S_NAME), 120 ST1, { C_ALPH, C_NUM }, ID1, 121 ST1, { '"' }, ST2, 122 ST1, { '\'' }, CC1, 123 124 /* saw " beginning string */ 125 ST2, { C_XX }, ST2, 126 ST2, { '"' }, ACT(STRING, S_SELF), 127 ST2, { '\\' }, ST3, 128 ST2, { '\n' }, S_STNL, 129 ST2, { EOFC }, S_EOFSTR, 130 131 /* saw \ in string */ 132 ST3, { C_XX }, ST2, 133 ST3, { '\n' }, S_STNL, 134 ST3, { EOFC }, S_EOFSTR, 135 136 /* saw ' beginning character const */ 137 CC1, { C_XX }, CC1, 138 CC1, { '\'' }, ACT(CCON, S_SELF), 139 CC1, { '\\' }, CC2, 140 CC1, { '\n' }, S_STNL, 141 CC1, { EOFC }, S_EOFSTR, 142 143 /* saw \ in ccon */ 144 CC2, { C_XX }, CC1, 145 CC2, { '\n' }, S_STNL, 146 CC2, { EOFC }, S_EOFSTR, 147 148 /* saw /, perhaps start of comment */ 149 COM1, { C_XX }, ACT(SLASH, S_SELFB), 150 COM1, { '=' }, ACT(ASSLASH, S_SELF), 151 COM1, { '*' }, COM2, 152 COM1, { '/' }, COM4, 153 154 /* saw "/*", start of comment */ 155 COM2, { C_XX }, COM2, 156 COM2, { '\n' }, S_COMNL, 157 COM2, { '*' }, COM3, 158 COM2, { EOFC }, S_EOFCOM, 159 160 /* saw the * possibly ending a comment */ 161 COM3, { C_XX }, COM2, 162 COM3, { '\n' }, S_COMNL, 163 COM3, { '*' }, COM3, 164 COM3, { '/' }, S_COMMENT, 165 COM3, { EOFC }, S_EOFCOM, 166 167 /* // comment */ 168 COM4, { C_XX }, COM4, 169 COM4, { '\n' }, S_NL, 170 COM4, { EOFC }, S_EOFCOM, 171 172 /* saw white space, eat it up */ 173 WS1, { C_XX }, S_WS, 174 WS1, { ' ', '\t', '\v', '\r'}, WS1, 175 176 /* saw -, check --, -=, -> */ 177 MINUS1, { C_XX }, ACT(MINUS, S_SELFB), 178 MINUS1, { '-' }, ACT(MMINUS, S_SELF), 179 MINUS1, { '=' }, ACT(ASMINUS,S_SELF), 180 MINUS1, { '>' }, ACT(ARROW,S_SELF), 181 182 /* saw +, check ++, += */ 183 PLUS1, { C_XX }, ACT(PLUS, S_SELFB), 184 PLUS1, { '+' }, ACT(PPLUS, S_SELF), 185 PLUS1, { '=' }, ACT(ASPLUS, S_SELF), 186 187 /* saw <, check <<, <<=, <= */ 188 LT1, { C_XX }, ACT(LT, S_SELFB), 189 LT1, { '<' }, LT2, 190 LT1, { '=' }, ACT(LEQ, S_SELF), 191 LT2, { C_XX }, ACT(LSH, S_SELFB), 192 LT2, { '=' }, ACT(ASLSH, S_SELF), 193 194 /* saw >, check >>, >>=, >= */ 195 GT1, { C_XX }, ACT(GT, S_SELFB), 196 GT1, { '>' }, GT2, 197 GT1, { '=' }, ACT(GEQ, S_SELF), 198 GT2, { C_XX }, ACT(RSH, S_SELFB), 199 GT2, { '=' }, ACT(ASRSH, S_SELF), 200 201 /* = */ 202 ASG1, { C_XX }, ACT(ASGN, S_SELFB), 203 ASG1, { '=' }, ACT(EQ, S_SELF), 204 205 /* ! */ 206 NOT1, { C_XX }, ACT(NOT, S_SELFB), 207 NOT1, { '=' }, ACT(NEQ, S_SELF), 208 209 /* & */ 210 AND1, { C_XX }, ACT(AND, S_SELFB), 211 AND1, { '&' }, ACT(LAND, S_SELF), 212 AND1, { '=' }, ACT(ASAND, S_SELF), 213 214 /* | */ 215 OR1, { C_XX }, ACT(OR, S_SELFB), 216 OR1, { '|' }, ACT(LOR, S_SELF), 217 OR1, { '=' }, ACT(ASOR, S_SELF), 218 219 /* # */ 220 SHARP1, { C_XX }, ACT(SHARP, S_SELFB), 221 SHARP1, { '#' }, ACT(DSHARP, S_SELF), 222 223 /* % */ 224 PCT1, { C_XX }, ACT(PCT, S_SELFB), 225 PCT1, { '=' }, ACT(ASPCT, S_SELF), 226 227 /* * */ 228 STAR1, { C_XX }, ACT(STAR, S_SELFB), 229 STAR1, { '=' }, ACT(ASSTAR, S_SELF), 230 231 /* ^ */ 232 CIRC1, { C_XX }, ACT(CIRC, S_SELFB), 233 CIRC1, { '=' }, ACT(ASCIRC, S_SELF), 234 235 -1 236 }; 237 238 /* first index is char, second is state */ 239 /* increase #states to power of 2 to encourage use of shift */ 240 short bigfsm[256][MAXSTATE]; 241 242 void 243 expandlex(void) 244 { 245 /*const*/ struct fsm *fp; 246 int i, j, nstate; 247 248 for (fp = fsm; fp->state>=0; fp++) { 249 for (i=0; fp->ch[i]; i++) { 250 nstate = fp->nextstate; 251 if (nstate >= S_SELF) 252 nstate = ~nstate; 253 switch (fp->ch[i]) { 254 255 case C_XX: /* random characters */ 256 for (j=0; j<256; j++) 257 bigfsm[j][fp->state] = nstate; 258 continue; 259 case C_ALPH: 260 for (j=0; j<=256; j++) 261 if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z' 262 || UTF2(j) || UTF3(j) || j=='_') 263 bigfsm[j][fp->state] = nstate; 264 continue; 265 case C_NUM: 266 for (j='0'; j<='9'; j++) 267 bigfsm[j][fp->state] = nstate; 268 continue; 269 default: 270 bigfsm[fp->ch[i]][fp->state] = nstate; 271 } 272 } 273 } 274 /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */ 275 for (i=0; i<MAXSTATE; i++) { 276 for (j=0; j<0xFF; j++) 277 if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) { 278 if (bigfsm[j][i]>0) 279 bigfsm[j][i] = ~bigfsm[j][i]; 280 bigfsm[j][i] &= ~QBSBIT; 281 } 282 bigfsm[EOB][i] = ~S_EOB; 283 if (bigfsm[EOFC][i]>=0) 284 bigfsm[EOFC][i] = ~S_EOF; 285 } 286 } 287 288 void 289 fixlex(void) 290 { 291 /* do C++ comments? */ 292 if (Cplusplus==0) 293 bigfsm['/'][COM1] = bigfsm['x'][COM1]; 294 } 295 296 /* 297 * fill in a row of tokens from input, terminated by NL or END 298 * First token is put at trp->lp. 299 * Reset is non-zero when the input buffer can be "rewound." 300 * The value is a flag indicating that possible macros have 301 * been seen in the row. 302 */ 303 int 304 gettokens(Tokenrow *trp, int reset) 305 { 306 register int c, state, oldstate; 307 register uchar *ip; 308 register Token *tp, *maxp; 309 int runelen; 310 Source *s = cursource; 311 int nmac = 0; 312 extern char outbuf[]; 313 314 tp = trp->lp; 315 ip = s->inp; 316 if (reset) { 317 s->lineinc = 0; 318 if (ip>=s->inl) { /* nothing in buffer */ 319 s->inl = s->inb; 320 fillbuf(s); 321 ip = s->inp = s->inb; 322 } else if (ip >= s->inb+(3*INS/4)) { 323 memmove(s->inb, ip, 4+s->inl-ip); 324 s->inl = s->inb+(s->inl-ip); 325 ip = s->inp = s->inb; 326 } 327 } 328 maxp = &trp->bp[trp->max]; 329 runelen = 1; 330 for (;;) { 331 continue2: 332 if (tp>=maxp) { 333 trp->lp = tp; 334 tp = growtokenrow(trp); 335 maxp = &trp->bp[trp->max]; 336 } 337 tp->type = UNCLASS; 338 tp->hideset = 0; 339 tp->t = ip; 340 tp->wslen = 0; 341 tp->flag = 0; 342 state = START; 343 for (;;) { 344 oldstate = state; 345 c = *ip; 346 if ((state = bigfsm[c][state]) >= 0) { 347 ip += runelen; 348 runelen = 1; 349 continue; 350 } 351 state = ~state; 352 reswitch: 353 switch (state&0177) { 354 case S_SELF: 355 ip += runelen; 356 runelen = 1; 357 case S_SELFB: 358 tp->type = GETACT(state); 359 tp->len = ip - tp->t; 360 tp++; 361 goto continue2; 362 363 case S_NAME: /* like S_SELFB but with nmac check */ 364 tp->type = NAME; 365 tp->len = ip - tp->t; 366 nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0); 367 tp++; 368 goto continue2; 369 370 case S_WS: 371 tp->wslen = ip - tp->t; 372 tp->t = ip; 373 state = START; 374 continue; 375 376 default: 377 if ((state&QBSBIT)==0) { 378 ip += runelen; 379 runelen = 1; 380 continue; 381 } 382 state &= ~QBSBIT; 383 s->inp = ip; 384 if (c=='?') { /* check trigraph */ 385 if (trigraph(s)) { 386 state = oldstate; 387 continue; 388 } 389 goto reswitch; 390 } 391 if (c=='\\') { /* line-folding */ 392 if (foldline(s)) { 393 s->lineinc++; 394 state = oldstate; 395 continue; 396 } 397 goto reswitch; 398 } 399 if (UTF2(c)) { 400 runelen = 2; 401 goto reswitch; 402 } 403 if (UTF3(c)) { 404 runelen = 3; 405 goto reswitch; 406 } 407 error(WARNING, "Lexical botch in cpp"); 408 ip += runelen; 409 runelen = 1; 410 continue; 411 412 case S_EOB: 413 s->inp = ip; 414 fillbuf(cursource); 415 state = oldstate; 416 continue; 417 418 case S_EOF: 419 tp->type = END; 420 tp->len = 0; 421 s->inp = ip; 422 if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1) 423 error(WARNING,"No newline at end of file"); 424 trp->lp = tp+1; 425 return nmac; 426 427 case S_STNL: 428 error(ERROR, "Unterminated string or char const"); 429 case S_NL: 430 tp->t = ip; 431 tp->type = NL; 432 tp->len = 1; 433 tp->wslen = 0; 434 s->lineinc++; 435 s->inp = ip+1; 436 trp->lp = tp+1; 437 return nmac; 438 439 case S_EOFSTR: 440 error(FATAL, "EOF in string or char constant"); 441 break; 442 443 case S_COMNL: 444 s->lineinc++; 445 state = COM2; 446 ip += runelen; 447 runelen = 1; 448 if (ip >= s->inb+(7*INS/8)) { /* very long comment */ 449 memmove(tp->t, ip, 4+s->inl-ip); 450 s->inl -= ip-tp->t; 451 ip = tp->t+1; 452 } 453 continue; 454 455 case S_EOFCOM: 456 error(WARNING, "EOF inside comment"); 457 --ip; 458 case S_COMMENT: 459 ++ip; 460 tp->t = ip; 461 tp->t[-1] = ' '; 462 tp->wslen = 1; 463 state = START; 464 continue; 465 } 466 break; 467 } 468 ip += runelen; 469 runelen = 1; 470 tp->len = ip - tp->t; 471 tp++; 472 } 473 return 0; 474 } 475 476 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 477 int 478 trigraph(Source *s) 479 { 480 int c; 481 482 while (s->inp+2 >= s->inl && fillbuf(s)!=EOF) 483 ; 484 if (s->inp[1]!='?') 485 return 0; 486 c = 0; 487 switch(s->inp[2]) { 488 case '=': 489 c = '#'; break; 490 case '(': 491 c = '['; break; 492 case '/': 493 c = '\\'; break; 494 case ')': 495 c = ']'; break; 496 case '\'': 497 c = '^'; break; 498 case '<': 499 c = '{'; break; 500 case '!': 501 c = '|'; break; 502 case '>': 503 c = '}'; break; 504 case '-': 505 c = '~'; break; 506 } 507 if (c) { 508 *s->inp = c; 509 memmove(s->inp+1, s->inp+3, s->inl-s->inp+2); 510 s->inl -= 2; 511 } 512 return c; 513 } 514 515 int 516 foldline(Source *s) 517 { 518 while (s->inp+1 >= s->inl && fillbuf(s)!=EOF) 519 ; 520 if (s->inp[1] == '\n') { 521 memmove(s->inp, s->inp+2, s->inl-s->inp+3); 522 s->inl -= 2; 523 return 1; 524 } 525 return 0; 526 } 527 528 int 529 fillbuf(Source *s) 530 { 531 int n; 532 533 if ((char *)s->inl+INS/8 > (char *)s->inb+INS) 534 error(FATAL, "Input buffer overflow"); 535 if (s->fd<0 || (n=read(s->fd, (char *)s->inl, INS/8)) <= 0) 536 n = 0; 537 if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */ 538 *s->inp = EOFC; 539 s->inl += n; 540 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB; 541 if (n==0) { 542 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC; 543 return EOF; 544 } 545 return 0; 546 } 547 548 /* 549 * Push down to new source of characters. 550 * If fd>0 and str==NULL, then from a file `name'; 551 * if fd==-1 and str, then from the string. 552 */ 553 Source * 554 setsource(char *name, int fd, char *str) 555 { 556 Source *s = new(Source); 557 int len; 558 559 s->line = 1; 560 s->lineinc = 0; 561 s->fd = fd; 562 s->filename = name; 563 s->next = cursource; 564 s->ifdepth = 0; 565 cursource = s; 566 /* slop at right for EOB */ 567 if (str) { 568 len = strlen(str); 569 s->inb = domalloc(len+4); 570 s->inp = s->inb; 571 strncpy((char *)s->inp, str, len); 572 } else { 573 Dir d; 574 int junk; 575 if (dirfstat(fd, &d) < 0) 576 d.length = 0; 577 junk = d.length; 578 if (junk<INS) 579 junk = INS; 580 s->inb = domalloc((junk)+4); 581 s->inp = s->inb; 582 len = 0; 583 } 584 s->inl = s->inp+len; 585 s->inl[0] = s->inl[1] = EOB; 586 return s; 587 } 588 589 void 590 unsetsource(void) 591 { 592 Source *s = cursource; 593 594 if (s->fd>=0) { 595 close(s->fd); 596 dofree(s->inb); 597 } 598 cursource = s->next; 599 dofree(s); 600 } 601