1 #include <u.h> 2 #include <libc.h> 3 #include <stdio.h> 4 #include "cpp.h" 5 6 /* 7 * lexical FSM encoding 8 * when in state state, and one of the characters 9 * in ch arrives, enter nextstate. 10 * States >= S_SELF are either final, or at least require special action. 11 * In 'fsm' there is a line for each state X charset X nextstate. 12 * List chars that overwrite previous entries later (e.g. C_ALPH 13 * can be overridden by '_' by a later entry; and C_XX is the 14 * the universal set, and should always be first. 15 * States above S_SELF are represented in the big table as negative values. 16 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 17 * These actions differ in that S_SELF doesn't have a lookahead char, 18 * S_SELFB does. 19 * 20 * The encoding is blown out into a big table for time-efficiency. 21 * Entries have 22 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 23 */ 24 25 #define MAXSTATE 32 26 #define ACT(tok,act) ((tok<<7)+act) 27 #define QBSBIT 0100 28 #define GETACT(st) (st>>7)&0x1ff 29 30 #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */ 31 #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */ 32 33 /* character classes */ 34 #define C_WS 1 35 #define C_ALPH 2 36 #define C_NUM 3 37 #define C_EOF 4 38 #define C_XX 5 39 40 enum state { 41 START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 42 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 43 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 44 S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 45 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 46 }; 47 48 int tottok; 49 int tokkind[256]; 50 struct fsm { 51 int state; /* if in this state */ 52 uchar ch[4]; /* and see one of these characters */ 53 int nextstate; /* enter this state if +ve */ 54 }; 55 56 /*const*/ struct fsm fsm[] = { 57 /* start state */ 58 START, { C_XX }, ACT(UNCLASS,S_SELF), 59 START, { ' ', '\t', '\v' }, WS1, 60 START, { C_NUM }, NUM1, 61 START, { '.' }, NUM3, 62 START, { C_ALPH }, ID1, 63 START, { 'L' }, ST1, 64 START, { '"' }, ST2, 65 START, { '\'' }, CC1, 66 START, { '/' }, COM1, 67 START, { EOFC }, S_EOF, 68 START, { '\n' }, S_NL, 69 START, { '-' }, MINUS1, 70 START, { '+' }, PLUS1, 71 START, { '<' }, LT1, 72 START, { '>' }, GT1, 73 START, { '=' }, ASG1, 74 START, { '!' }, NOT1, 75 START, { '&' }, AND1, 76 START, { '|' }, OR1, 77 START, { '#' }, SHARP1, 78 START, { '%' }, PCT1, 79 START, { '[' }, ACT(SBRA,S_SELF), 80 START, { ']' }, ACT(SKET,S_SELF), 81 START, { '(' }, ACT(LP,S_SELF), 82 START, { ')' }, ACT(RP,S_SELF), 83 START, { '*' }, STAR1, 84 START, { ',' }, ACT(COMMA,S_SELF), 85 START, { '?' }, ACT(QUEST,S_SELF), 86 START, { ':' }, ACT(COLON,S_SELF), 87 START, { ';' }, ACT(SEMIC,S_SELF), 88 START, { '{' }, ACT(CBRA,S_SELF), 89 START, { '}' }, ACT(CKET,S_SELF), 90 START, { '~' }, ACT(TILDE,S_SELF), 91 START, { '^' }, CIRC1, 92 93 /* saw a digit */ 94 NUM1, { C_XX }, ACT(NUMBER,S_SELFB), 95 NUM1, { C_NUM, C_ALPH, '.' }, NUM1, 96 NUM1, { 'E', 'e' }, NUM2, 97 NUM1, { '_' }, ACT(NUMBER,S_SELFB), 98 99 /* saw possible start of exponent, digits-e */ 100 NUM2, { C_XX }, ACT(NUMBER,S_SELFB), 101 NUM2, { '+', '-' }, NUM1, 102 NUM2, { C_NUM, C_ALPH }, NUM1, 103 NUM2, { '_' }, ACT(NUMBER,S_SELFB), 104 105 /* saw a '.', which could be a number or an operator */ 106 NUM3, { C_XX }, ACT(DOT,S_SELFB), 107 NUM3, { '.' }, DOTS1, 108 NUM3, { C_NUM }, NUM1, 109 110 DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB), 111 DOTS1, { C_NUM }, NUM1, 112 DOTS1, { '.' }, ACT(ELLIPS, S_SELF), 113 114 /* saw a letter or _ */ 115 ID1, { C_XX }, ACT(NAME,S_NAME), 116 ID1, { C_ALPH, C_NUM }, ID1, 117 118 /* saw L (start of wide string?) */ 119 ST1, { C_XX }, ACT(NAME,S_NAME), 120 ST1, { C_ALPH, C_NUM }, ID1, 121 ST1, { '"' }, ST2, 122 ST1, { '\'' }, CC1, 123 124 /* saw " beginning string */ 125 ST2, { C_XX }, ST2, 126 ST2, { '"' }, ACT(STRING, S_SELF), 127 ST2, { '\\' }, ST3, 128 ST2, { '\n' }, S_STNL, 129 ST2, { EOFC }, S_EOFSTR, 130 131 /* saw \ in string */ 132 ST3, { C_XX }, ST2, 133 ST3, { '\n' }, S_STNL, 134 ST3, { EOFC }, S_EOFSTR, 135 136 /* saw ' beginning character const */ 137 CC1, { C_XX }, CC1, 138 CC1, { '\'' }, ACT(CCON, S_SELF), 139 CC1, { '\\' }, CC2, 140 CC1, { '\n' }, S_STNL, 141 CC1, { EOFC }, S_EOFSTR, 142 143 /* saw \ in ccon */ 144 CC2, { C_XX }, CC1, 145 CC2, { '\n' }, S_STNL, 146 CC2, { EOFC }, S_EOFSTR, 147 148 /* saw /, perhaps start of comment */ 149 COM1, { C_XX }, ACT(SLASH, S_SELFB), 150 COM1, { '=' }, ACT(ASSLASH, S_SELF), 151 COM1, { '*' }, COM2, 152 COM1, { '/' }, COM4, 153 154 /* saw "/*", start of comment */ 155 COM2, { C_XX }, COM2, 156 COM2, { '\n' }, S_COMNL, 157 COM2, { '*' }, COM3, 158 COM2, { EOFC }, S_EOFCOM, 159 160 /* saw the * possibly ending a comment */ 161 COM3, { C_XX }, COM2, 162 COM3, { '\n' }, S_COMNL, 163 COM3, { '*' }, COM3, 164 COM3, { '/' }, S_COMMENT, 165 166 /* // comment */ 167 COM4, { C_XX }, COM4, 168 COM4, { '\n' }, S_NL, 169 COM4, { EOFC }, S_EOFCOM, 170 171 /* saw white space, eat it up */ 172 WS1, { C_XX }, S_WS, 173 WS1, { ' ', '\t', '\v' }, WS1, 174 175 /* saw -, check --, -=, -> */ 176 MINUS1, { C_XX }, ACT(MINUS, S_SELFB), 177 MINUS1, { '-' }, ACT(MMINUS, S_SELF), 178 MINUS1, { '=' }, ACT(ASMINUS,S_SELF), 179 MINUS1, { '>' }, ACT(ARROW,S_SELF), 180 181 /* saw +, check ++, += */ 182 PLUS1, { C_XX }, ACT(PLUS, S_SELFB), 183 PLUS1, { '+' }, ACT(PPLUS, S_SELF), 184 PLUS1, { '=' }, ACT(ASPLUS, S_SELF), 185 186 /* saw <, check <<, <<=, <= */ 187 LT1, { C_XX }, ACT(LT, S_SELFB), 188 LT1, { '<' }, LT2, 189 LT1, { '=' }, ACT(LEQ, S_SELF), 190 LT2, { C_XX }, ACT(LSH, S_SELFB), 191 LT2, { '=' }, ACT(ASLSH, S_SELF), 192 193 /* saw >, check >>, >>=, >= */ 194 GT1, { C_XX }, ACT(GT, S_SELFB), 195 GT1, { '>' }, GT2, 196 GT1, { '=' }, ACT(GEQ, S_SELF), 197 GT2, { C_XX }, ACT(RSH, S_SELFB), 198 GT2, { '=' }, ACT(ASRSH, S_SELF), 199 200 /* = */ 201 ASG1, { C_XX }, ACT(ASGN, S_SELFB), 202 ASG1, { '=' }, ACT(EQ, S_SELF), 203 204 /* ! */ 205 NOT1, { C_XX }, ACT(NOT, S_SELFB), 206 NOT1, { '=' }, ACT(NEQ, S_SELF), 207 208 /* & */ 209 AND1, { C_XX }, ACT(AND, S_SELFB), 210 AND1, { '&' }, ACT(LAND, S_SELF), 211 AND1, { '=' }, ACT(ASAND, S_SELF), 212 213 /* | */ 214 OR1, { C_XX }, ACT(OR, S_SELFB), 215 OR1, { '|' }, ACT(LOR, S_SELF), 216 OR1, { '=' }, ACT(ASOR, S_SELF), 217 218 /* # */ 219 SHARP1, { C_XX }, ACT(SHARP, S_SELFB), 220 SHARP1, { '#' }, ACT(DSHARP, S_SELF), 221 222 /* % */ 223 PCT1, { C_XX }, ACT(PCT, S_SELFB), 224 PCT1, { '=' }, ACT(ASPCT, S_SELF), 225 226 /* * */ 227 STAR1, { C_XX }, ACT(STAR, S_SELFB), 228 STAR1, { '=' }, ACT(ASSTAR, S_SELF), 229 230 /* ^ */ 231 CIRC1, { C_XX }, ACT(CIRC, S_SELFB), 232 CIRC1, { '=' }, ACT(ASCIRC, S_SELF), 233 234 -1 235 }; 236 237 /* first index is char, second is state */ 238 /* increase #states to power of 2 to encourage use of shift */ 239 short bigfsm[256][MAXSTATE]; 240 241 void 242 expandlex(void) 243 { 244 /*const*/ struct fsm *fp; 245 int i, j, nstate; 246 247 for (fp = fsm; fp->state>=0; fp++) { 248 for (i=0; fp->ch[i]; i++) { 249 nstate = fp->nextstate; 250 if (nstate >= S_SELF) 251 nstate = ~nstate; 252 switch (fp->ch[i]) { 253 254 case C_XX: /* random characters */ 255 for (j=0; j<256; j++) 256 bigfsm[j][fp->state] = nstate; 257 continue; 258 case C_ALPH: 259 for (j=0; j<=256; j++) 260 if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z' 261 || UTF2(j) || UTF3(j) || j=='_') 262 bigfsm[j][fp->state] = nstate; 263 continue; 264 case C_NUM: 265 for (j='0'; j<='9'; j++) 266 bigfsm[j][fp->state] = nstate; 267 continue; 268 default: 269 bigfsm[fp->ch[i]][fp->state] = nstate; 270 } 271 } 272 } 273 /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */ 274 for (i=0; i<MAXSTATE; i++) { 275 for (j=0; j<0xFF; j++) 276 if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) { 277 if (bigfsm[j][i]>0) 278 bigfsm[j][i] = ~bigfsm[j][i]; 279 bigfsm[j][i] &= ~QBSBIT; 280 } 281 bigfsm[EOB][i] = ~S_EOB; 282 if (bigfsm[EOFC][i]>=0) 283 bigfsm[EOFC][i] = ~S_EOF; 284 } 285 } 286 287 void 288 fixlex(void) 289 { 290 /* do C++ comments? */ 291 if (Cplusplus==0) 292 bigfsm['/'][COM1] = bigfsm['x'][COM1]; 293 } 294 295 /* 296 * fill in a row of tokens from input, terminated by NL or END 297 * First token is put at trp->lp. 298 * Reset is non-zero when the input buffer can be "rewound." 299 * The value is a flag indicating that possible macros have 300 * been seen in the row. 301 */ 302 int 303 gettokens(Tokenrow *trp, int reset) 304 { 305 register int c, state, oldstate; 306 register uchar *ip; 307 register Token *tp, *maxp; 308 int runelen; 309 Source *s = cursource; 310 int nmac = 0; 311 extern char outbuf[]; 312 313 tp = trp->lp; 314 ip = s->inp; 315 if (reset) { 316 s->lineinc = 0; 317 if (ip>=s->inl) { /* nothing in buffer */ 318 s->inl = s->inb; 319 fillbuf(s); 320 ip = s->inp = s->inb; 321 } else if (ip >= s->inb+(3*INS/4)) { 322 memmove(s->inb, ip, 4+s->inl-ip); 323 s->inl = s->inb+(s->inl-ip); 324 ip = s->inp = s->inb; 325 } 326 } 327 maxp = &trp->bp[trp->max]; 328 runelen = 1; 329 for (;;) { 330 continue2: 331 if (tp>=maxp) { 332 trp->lp = tp; 333 tp = growtokenrow(trp); 334 maxp = &trp->bp[trp->max]; 335 } 336 tp->type = UNCLASS; 337 tp->hideset = 0; 338 tp->t = ip; 339 tp->wslen = 0; 340 tp->flag = 0; 341 state = START; 342 for (;;) { 343 oldstate = state; 344 c = *ip; 345 if ((state = bigfsm[c][state]) >= 0) { 346 ip += runelen; 347 runelen = 1; 348 continue; 349 } 350 state = ~state; 351 reswitch: 352 switch (state&0177) { 353 case S_SELF: 354 ip += runelen; 355 runelen = 1; 356 case S_SELFB: 357 tp->type = GETACT(state); 358 tp->len = ip - tp->t; 359 tp++; 360 goto continue2; 361 362 case S_NAME: /* like S_SELFB but with nmac check */ 363 tp->type = NAME; 364 tp->len = ip - tp->t; 365 nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0); 366 tp++; 367 goto continue2; 368 369 case S_WS: 370 tp->wslen = ip - tp->t; 371 tp->t = ip; 372 state = START; 373 continue; 374 375 default: 376 if ((state&QBSBIT)==0) { 377 ip += runelen; 378 runelen = 1; 379 continue; 380 } 381 state &= ~QBSBIT; 382 s->inp = ip; 383 if (c=='?') { /* check trigraph */ 384 if (trigraph(s)) { 385 state = oldstate; 386 continue; 387 } 388 goto reswitch; 389 } 390 if (c=='\\') { /* line-folding */ 391 if (foldline(s)) { 392 s->lineinc++; 393 state = oldstate; 394 continue; 395 } 396 goto reswitch; 397 } 398 if (UTF2(c)) { 399 runelen = 2; 400 goto reswitch; 401 } 402 if (UTF3(c)) { 403 runelen = 3; 404 goto reswitch; 405 } 406 error(WARNING, "Lexical botch in cpp"); 407 ip += runelen; 408 runelen = 1; 409 continue; 410 411 case S_EOB: 412 s->inp = ip; 413 fillbuf(cursource); 414 state = oldstate; 415 continue; 416 417 case S_EOF: 418 tp->type = END; 419 tp->len = 0; 420 s->inp = ip; 421 if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1) 422 error(WARNING,"No newline at end of file"); 423 trp->lp = tp+1; 424 return nmac; 425 426 case S_STNL: 427 error(ERROR, "Unterminated string or char const"); 428 case S_NL: 429 tp->t = ip; 430 tp->type = NL; 431 tp->len = 1; 432 tp->wslen = 0; 433 s->lineinc++; 434 s->inp = ip+1; 435 trp->lp = tp+1; 436 return nmac; 437 438 case S_EOFSTR: 439 error(FATAL, "EOF in string or char constant"); 440 break; 441 442 case S_COMNL: 443 s->lineinc++; 444 state = COM2; 445 ip += runelen; 446 runelen = 1; 447 continue; 448 449 case S_EOFCOM: 450 error(WARNING, "EOF inside comment"); 451 --ip; 452 case S_COMMENT: 453 ++ip; 454 tp->t = ip; 455 tp->t[-1] = ' '; 456 tp->wslen = 1; 457 state = START; 458 continue; 459 } 460 break; 461 } 462 ip += runelen; 463 runelen = 1; 464 tp->len = ip - tp->t; 465 tp++; 466 } 467 return 0; 468 } 469 470 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 471 int 472 trigraph(Source *s) 473 { 474 int c; 475 476 while (s->inp+2 >= s->inl && fillbuf(s)!=EOF) 477 ; 478 if (s->inp[1]!='?') 479 return 0; 480 c = 0; 481 switch(s->inp[2]) { 482 case '=': 483 c = '#'; break; 484 case '(': 485 c = '['; break; 486 case '/': 487 c = '\\'; break; 488 case ')': 489 c = ']'; break; 490 case '\'': 491 c = '^'; break; 492 case '<': 493 c = '{'; break; 494 case '!': 495 c = '|'; break; 496 case '>': 497 c = '}'; break; 498 case '-': 499 c = '~'; break; 500 } 501 if (c) { 502 *s->inp = c; 503 memmove(s->inp+1, s->inp+3, s->inl-s->inp+2); 504 s->inl -= 2; 505 } 506 return c; 507 } 508 509 int 510 foldline(Source *s) 511 { 512 while (s->inp+1 >= s->inl && fillbuf(s)!=EOF) 513 ; 514 if (s->inp[1] == '\n') { 515 memmove(s->inp, s->inp+2, s->inl-s->inp+3); 516 s->inl -= 2; 517 return 1; 518 } 519 return 0; 520 } 521 522 int 523 fillbuf(Source *s) 524 { 525 int n; 526 527 if (s->fd<0 || (n=read(s->fd, (char *)s->inl, INS/8)) <= 0) 528 n = 0; 529 s->inl += n; 530 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB; 531 if (n==0) { 532 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC; 533 return EOF; 534 } 535 return 0; 536 } 537 538 /* 539 * Push down to new source of characters. 540 * If fd>0 and str==NULL, then from a file `name'; 541 * if fd==-1 and str, then from the string. 542 */ 543 Source * 544 setsource(char *name, int fd, char *str) 545 { 546 Source *s = new(Source); 547 int len; 548 549 s->line = 1; 550 s->lineinc = 0; 551 s->fd = fd; 552 s->filename = name; 553 s->next = cursource; 554 s->ifdepth = 0; 555 cursource = s; 556 /* slop at right for EOB */ 557 if (str) { 558 len = strlen(str); 559 s->inb = domalloc(len+4); 560 s->inp = s->inb; 561 strncpy((char *)s->inp, str, len); 562 } else { 563 Dir d; 564 if (dirfstat(fd, &d) < 0) 565 d.length = 0; 566 s->inb = domalloc((d.length<INS? INS: d.length)+4); 567 s->inp = s->inb; 568 len = 0; 569 } 570 s->inl = s->inp+len; 571 s->inl[0] = s->inl[1] = EOB; 572 return s; 573 } 574 575 void 576 unsetsource(void) 577 { 578 Source *s = cursource; 579 580 if (s->fd>=0) { 581 close(s->fd); 582 dofree(s->inb); 583 } 584 cursource = s->next; 585 dofree(s); 586 } 587