1 #include <u.h> 2 #include <libc.h> 3 #include <stdio.h> 4 #include "cpp.h" 5 6 /* 7 * lexical FSM encoding 8 * when in state state, and one of the characters 9 * in ch arrives, enter nextstate. 10 * States >= S_SELF are either final, or at least require special action. 11 * In 'fsm' there is a line for each state X charset X nextstate. 12 * List chars that overwrite previous entries later (e.g. C_ALPH 13 * can be overridden by '_' by a later entry; and C_XX is the 14 * the universal set, and should always be first. 15 * States above S_SELF are represented in the big table as negative values. 16 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 17 * These actions differ in that S_SELF doesn't have a lookahead char, 18 * S_SELFB does. 19 * 20 * The encoding is blown out into a big table for time-efficiency. 21 * Entries have 22 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 23 */ 24 25 #define MAXSTATE 32 26 #define ACT(tok,act) ((tok<<7)+act) 27 #define QBSBIT 0100 28 #define GETACT(st) (st>>7)&0x1ff 29 30 #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */ 31 #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */ 32 33 /* character classes */ 34 #define C_WS 1 35 #define C_ALPH 2 36 #define C_NUM 3 37 #define C_EOF 4 38 #define C_XX 5 39 40 enum state { 41 START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 42 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 43 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 44 S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 45 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 46 }; 47 48 int tottok; 49 int tokkind[256]; 50 struct fsm { 51 int state; /* if in this state */ 52 uchar ch[4]; /* and see one of these characters */ 53 int nextstate; /* enter this state if +ve */ 54 }; 55 56 /*const*/ struct fsm fsm[] = { 57 /* start state */ 58 START, { C_XX }, ACT(UNCLASS,S_SELF), 59 START, { ' ', '\t', '\v' }, WS1, 60 START, { C_NUM }, NUM1, 61 START, { '.' }, NUM3, 62 START, { C_ALPH }, ID1, 63 START, { 'L' }, ST1, 64 START, { '"' }, ST2, 65 START, { '\'' }, CC1, 66 START, { '/' }, COM1, 67 START, { EOF }, S_EOF, 68 START, { '\n' }, S_NL, 69 START, { '-' }, MINUS1, 70 START, { '+' }, PLUS1, 71 START, { '<' }, LT1, 72 START, { '>' }, GT1, 73 START, { '=' }, ASG1, 74 START, { '!' }, NOT1, 75 START, { '&' }, AND1, 76 START, { '|' }, OR1, 77 START, { '#' }, SHARP1, 78 START, { '%' }, PCT1, 79 START, { '[' }, ACT(SBRA,S_SELF), 80 START, { ']' }, ACT(SKET,S_SELF), 81 START, { '(' }, ACT(LP,S_SELF), 82 START, { ')' }, ACT(RP,S_SELF), 83 START, { '*' }, STAR1, 84 START, { ',' }, ACT(COMMA,S_SELF), 85 START, { '?' }, ACT(QUEST,S_SELF), 86 START, { ':' }, ACT(COLON,S_SELF), 87 START, { ';' }, ACT(SEMIC,S_SELF), 88 START, { '{' }, ACT(CBRA,S_SELF), 89 START, { '}' }, ACT(CKET,S_SELF), 90 START, { '~' }, ACT(TILDE,S_SELF), 91 START, { '^' }, CIRC1, 92 93 /* saw a digit */ 94 NUM1, { C_XX }, ACT(NUMBER,S_SELFB), 95 NUM1, { C_NUM, C_ALPH, '.' }, NUM1, 96 NUM1, { 'E', 'e' }, NUM2, 97 NUM1, { '_' }, ACT(NUMBER,S_SELFB), 98 99 /* saw possible start of exponent, digits-e */ 100 NUM2, { C_XX }, ACT(NUMBER,S_SELFB), 101 NUM2, { '+', '-' }, NUM1, 102 NUM2, { C_NUM, C_ALPH }, NUM1, 103 NUM2, { '_' }, ACT(NUMBER,S_SELFB), 104 105 /* saw a '.', which could be a number or an operator */ 106 NUM3, { C_XX }, ACT(DOT,S_SELFB), 107 NUM3, { '.' }, DOTS1, 108 NUM3, { C_NUM }, NUM1, 109 110 DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB), 111 DOTS1, { C_NUM }, NUM1, 112 DOTS1, { '.' }, ACT(ELLIPS, S_SELF), 113 114 /* saw a letter or _ */ 115 ID1, { C_XX }, ACT(NAME,S_NAME), 116 ID1, { C_ALPH, C_NUM }, ID1, 117 118 /* saw L (start of wide string?) */ 119 ST1, { C_XX }, ACT(NAME,S_NAME), 120 ST1, { C_ALPH, C_NUM }, ID1, 121 ST1, { '"' }, ST2, 122 ST1, { '\'' }, CC1, 123 124 /* saw " beginning string */ 125 ST2, { C_XX }, ST2, 126 ST2, { '"' }, ACT(STRING, S_SELF), 127 ST2, { '\\' }, ST3, 128 ST2, { '\n' }, S_STNL, 129 ST1, { EOF }, S_EOFSTR, 130 131 /* saw \ in string */ 132 ST3, { C_XX }, ST2, 133 ST3, { '\n' }, S_STNL, 134 ST3, { EOF }, S_EOFSTR, 135 136 /* saw ' beginning character const */ 137 CC1, { C_XX }, CC1, 138 CC1, { '\'' }, ACT(CCON, S_SELF), 139 CC1, { '\\' }, CC2, 140 CC1, { '\n' }, S_STNL, 141 CC1, { EOF }, S_EOFSTR, 142 143 /* saw \ in ccon */ 144 CC2, { C_XX }, CC1, 145 CC2, { '\n' }, S_STNL, 146 CC2, { EOF }, S_EOFSTR, 147 148 /* saw /, perhaps start of comment */ 149 COM1, { C_XX }, ACT(SLASH, S_SELFB), 150 COM1, { '=' }, ACT(ASSLASH, S_SELF), 151 COM1, { '*' }, COM2, 152 COM1, { '/' }, COM4, 153 154 /* saw "/*", start of comment */ 155 COM2, { C_XX }, COM2, 156 COM2, { '\n' }, S_COMNL, 157 COM2, { '*' }, COM3, 158 COM2, { EOF }, S_EOFCOM, 159 160 /* saw the * possibly ending a comment */ 161 COM3, { C_XX }, COM2, 162 COM3, { '\n' }, S_COMNL, 163 COM3, { '*' }, COM3, 164 COM3, { '/' }, S_COMMENT, 165 166 /* // comment */ 167 COM4, { C_XX }, COM4, 168 COM4, { '\n' }, S_NL, 169 COM4, { EOF }, S_EOFCOM, 170 171 /* saw white space, eat it up */ 172 WS1, { C_XX }, S_WS, 173 WS1, { ' ', '\t', '\v' }, WS1, 174 175 /* saw -, check --, -=, -> */ 176 MINUS1, { C_XX }, ACT(MINUS, S_SELFB), 177 MINUS1, { '-' }, ACT(MMINUS, S_SELF), 178 MINUS1, { '=' }, ACT(ASMINUS,S_SELF), 179 MINUS1, { '>' }, ACT(ARROW,S_SELF), 180 181 /* saw +, check ++, += */ 182 PLUS1, { C_XX }, ACT(PLUS, S_SELFB), 183 PLUS1, { '+' }, ACT(PPLUS, S_SELF), 184 PLUS1, { '=' }, ACT(ASPLUS, S_SELF), 185 186 /* saw <, check <<, <<=, <= */ 187 LT1, { C_XX }, ACT(LT, S_SELFB), 188 LT1, { '<' }, LT2, 189 LT1, { '=' }, ACT(LEQ, S_SELF), 190 LT2, { C_XX }, ACT(LSH, S_SELFB), 191 LT2, { '=' }, ACT(ASLSH, S_SELF), 192 193 /* saw >, check >>, >>=, >= */ 194 GT1, { C_XX }, ACT(GT, S_SELFB), 195 GT1, { '>' }, GT2, 196 GT1, { '=' }, ACT(GEQ, S_SELF), 197 GT2, { C_XX }, ACT(RSH, S_SELFB), 198 GT2, { '=' }, ACT(ASRSH, S_SELF), 199 200 /* = */ 201 ASG1, { C_XX }, ACT(ASGN, S_SELFB), 202 ASG1, { '=' }, ACT(EQ, S_SELF), 203 204 /* ! */ 205 NOT1, { C_XX }, ACT(NOT, S_SELFB), 206 NOT1, { '=' }, ACT(NEQ, S_SELF), 207 208 /* & */ 209 AND1, { C_XX }, ACT(AND, S_SELFB), 210 AND1, { '&' }, ACT(LAND, S_SELF), 211 AND1, { '=' }, ACT(ASAND, S_SELF), 212 213 /* | */ 214 OR1, { C_XX }, ACT(OR, S_SELFB), 215 OR1, { '|' }, ACT(LOR, S_SELF), 216 OR1, { '=' }, ACT(ASOR, S_SELF), 217 218 /* # */ 219 SHARP1, { C_XX }, ACT(SHARP, S_SELFB), 220 SHARP1, { '#' }, ACT(DSHARP, S_SELF), 221 222 /* % */ 223 PCT1, { C_XX }, ACT(PCT, S_SELFB), 224 PCT1, { '=' }, ACT(ASPCT, S_SELF), 225 226 /* * */ 227 STAR1, { C_XX }, ACT(STAR, S_SELFB), 228 STAR1, { '=' }, ACT(ASSTAR, S_SELF), 229 230 /* ^ */ 231 CIRC1, { C_XX }, ACT(CIRC, S_SELFB), 232 CIRC1, { '=' }, ACT(ASCIRC, S_SELF), 233 234 -1 235 }; 236 237 /* first index is char+1 (to include EOF), second is state */ 238 /* increase #states to power of 2 to encourage use of shift */ 239 short bigfsm[257][MAXSTATE]; 240 241 void 242 expandlex(void) 243 { 244 /*const*/ struct fsm *fp; 245 int i, j, nstate; 246 247 for (fp = fsm; fp->state>=0; fp++) { 248 for (i=0; fp->ch[i]; i++) { 249 nstate = fp->nextstate; 250 if (nstate >= S_SELF) 251 nstate = ~nstate; 252 switch (fp->ch[i]) { 253 254 case C_XX: /* random characters */ 255 for (j=0; j<257; j++) 256 bigfsm[j][fp->state] = nstate; 257 continue; 258 case C_ALPH: 259 for (j=0; j<=255; j++) 260 if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z' 261 || UTF2(j) || UTF3(j) || j=='_') 262 bigfsm[j+1][fp->state] = nstate; 263 continue; 264 case C_NUM: 265 for (j='0'; j<='9'; j++) 266 bigfsm[j+1][fp->state] = nstate; 267 continue; 268 default: 269 bigfsm[fp->ch[i]+1][fp->state] = nstate; 270 } 271 } 272 } 273 /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */ 274 for (i=0; i<MAXSTATE; i++) { 275 for (j=0; j<0xFF; j++) 276 if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) { 277 if (bigfsm[j+1][i]>0) 278 bigfsm[j+1][i] = ~bigfsm[j+1][i]; 279 bigfsm[j+1][i] &= ~QBSBIT; 280 } 281 bigfsm[EOB+1][i] = ~S_EOB; 282 } 283 } 284 285 void 286 fixlex(void) 287 { 288 /* do C++ comments? */ 289 if (Cplusplus==0) 290 bigfsm['/'+1][COM1] = bigfsm['x'+1][COM1]; 291 } 292 293 /* 294 * fill in a row of tokens from input, terminated by NL or END 295 * First token is put at trp->lp. 296 * Reset is non-zero when the input buffer can be "rewound." 297 * The value is a flag indicating that possible macros have 298 * been seen in the row. 299 */ 300 int 301 gettokens(Tokenrow *trp, int reset) 302 { 303 register int c, state, oldstate; 304 register uchar *ip; 305 register Token *tp, *maxp; 306 int runelen; 307 Source *s = cursource; 308 int nmac = 0; 309 extern char outbuf[]; 310 311 tp = trp->lp; 312 ip = s->inp; 313 if (reset) { 314 s->lineinc = 0; 315 if (ip>=s->inl) { /* nothing in buffer */ 316 s->inl = s->inb; 317 fillbuf(s); 318 ip = s->inp = s->inb; 319 } else if (ip >= s->inb+(3*INS/4)) { 320 memmove(s->inb, ip, 4+s->inl-ip); 321 s->inl = s->inb+(s->inl-ip); 322 ip = s->inp = s->inb; 323 } 324 } 325 maxp = &trp->bp[trp->max]; 326 runelen = 1; 327 for (;;) { 328 continue2: 329 if (tp>=maxp) { 330 trp->lp = tp; 331 tp = growtokenrow(trp); 332 maxp = &trp->bp[trp->max]; 333 } 334 tp->type = UNCLASS; 335 tp->hideset = 0; 336 tp->t = ip; 337 tp->wslen = 0; 338 tp->flag = 0; 339 state = START; 340 for (;;) { 341 oldstate = state; 342 c = *ip; 343 if ((state = bigfsm[c+1][state]) >= 0) { 344 ip += runelen; 345 runelen = 1; 346 continue; 347 } 348 state = ~state; 349 reswitch: 350 switch (state&0177) { 351 case S_SELF: 352 ip += runelen; 353 runelen = 1; 354 case S_SELFB: 355 tp->type = GETACT(state); 356 tp->len = ip - tp->t; 357 tp++; 358 goto continue2; 359 360 case S_NAME: /* like S_SELFB but with nmac check */ 361 tp->type = NAME; 362 tp->len = ip - tp->t; 363 nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0); 364 tp++; 365 goto continue2; 366 367 case S_WS: 368 tp->wslen = ip - tp->t; 369 tp->t = ip; 370 state = START; 371 continue; 372 373 default: 374 if ((state&QBSBIT)==0) { 375 ip += runelen; 376 runelen = 1; 377 continue; 378 } 379 state &= ~QBSBIT; 380 s->inp = ip; 381 if (c=='?') { /* check trigraph */ 382 if (trigraph(s)) { 383 state = oldstate; 384 continue; 385 } 386 goto reswitch; 387 } 388 if (c=='\\') { /* line-folding */ 389 if (foldline(s)) { 390 s->lineinc++; 391 state = oldstate; 392 continue; 393 } 394 goto reswitch; 395 } 396 if (UTF2(c)) { 397 runelen = 2; 398 goto reswitch; 399 } 400 if (UTF3(c)) { 401 runelen = 3; 402 goto reswitch; 403 } 404 error(WARNING, "Lexical botch in cpp"); 405 ip += runelen; 406 runelen = 1; 407 continue; 408 409 case S_EOB: 410 s->inp = ip; 411 fillbuf(cursource); 412 state = oldstate; 413 continue; 414 415 case S_EOF: 416 tp->type = END; 417 tp->len = 0; 418 s->inp = ip; 419 if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1) 420 error(WARNING,"No newline at end of file"); 421 trp->lp = tp+1; 422 return nmac; 423 424 case S_STNL: 425 error(ERROR, "Unterminated string or char const"); 426 case S_NL: 427 tp->t = ip; 428 tp->type = NL; 429 tp->len = 1; 430 tp->wslen = 0; 431 s->lineinc++; 432 s->inp = ip+1; 433 trp->lp = tp+1; 434 return nmac; 435 436 case S_EOFSTR: 437 error(FATAL, "EOF in string or char constant"); 438 break; 439 440 case S_COMNL: 441 s->lineinc++; 442 state = COM2; 443 ip += runelen; 444 runelen = 1; 445 continue; 446 447 case S_EOFCOM: 448 error(WARNING, "EOF inside comment"); 449 --ip; 450 case S_COMMENT: 451 ++ip; 452 tp->t = ip; 453 tp->t[-1] = ' '; 454 tp->wslen = 1; 455 state = START; 456 continue; 457 } 458 break; 459 } 460 ip += runelen; 461 runelen = 1; 462 tp->len = ip - tp->t; 463 tp++; 464 } 465 } 466 467 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 468 int 469 trigraph(Source *s) 470 { 471 int c; 472 473 while (s->inp+2 >= s->inl && fillbuf(s)!=EOF) 474 ; 475 if (s->inp[1]!='?') 476 return 0; 477 c = 0; 478 switch(s->inp[2]) { 479 case '=': 480 c = '#'; break; 481 case '(': 482 c = '['; break; 483 case '/': 484 c = '\\'; break; 485 case ')': 486 c = ']'; break; 487 case '\'': 488 c = '^'; break; 489 case '<': 490 c = '{'; break; 491 case '!': 492 c = '|'; break; 493 case '>': 494 c = '}'; break; 495 case '-': 496 c = '~'; break; 497 } 498 if (c) { 499 *s->inp = c; 500 memmove(s->inp+1, s->inp+3, s->inl-s->inp+2); 501 s->inl -= 2; 502 } 503 return c; 504 } 505 506 int 507 foldline(Source *s) 508 { 509 while (s->inp+1 >= s->inl && fillbuf(s)!=EOF) 510 ; 511 if (s->inp[1] == '\n') { 512 memmove(s->inp, s->inp+2, s->inl-s->inp+3); 513 s->inl -= 2; 514 return 1; 515 } 516 return 0; 517 } 518 519 int 520 fillbuf(Source *s) 521 { 522 int n; 523 524 if (s->fd<0) 525 n = 0; 526 else if ((n=read(s->fd, (char *)s->inl, INS/8)) <= 0) 527 n = 0; 528 s->inl += n; 529 s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB; 530 if (n==0) { 531 s->inl[0] = EOF; 532 return EOF; 533 } 534 return 0; 535 } 536 537 /* 538 * Push down to new source of characters. 539 * If fd>0 and str==NULL, then from a file `name'; 540 * if fd==-1 and str, then from the string. 541 */ 542 Source * 543 setsource(char *name, int fd, char *str) 544 { 545 Source *s = new(Source); 546 int len; 547 548 s->line = 1; 549 s->lineinc = 0; 550 s->fd = fd; 551 s->filename = name; 552 /* slop at right for EOB */ 553 if (str) { 554 len = strlen(str); 555 s->inb = domalloc(len+4); 556 s->inp = s->inb; 557 strncpy((char *)s->inp, str, len); 558 } else { 559 s->inb = domalloc(INS+4); 560 s->inp = s->inb; 561 len = 0; 562 } 563 s->inl = s->inp+len; 564 s->inl[0] = s->inl[1] = EOB; 565 s->next = cursource; 566 s->ifdepth = 0; 567 cursource = s; 568 return s; 569 } 570 571 void 572 unsetsource(void) 573 { 574 Source *s = cursource; 575 576 if (s->fd>=0) { 577 close(s->fd); 578 dofree(s->inb); 579 } 580 cursource = s->next; 581 dofree(s); 582 } 583