1 /* $NetBSD: llex.c,v 1.7 2016/01/28 14:41:39 lneto Exp $ */ 2 3 /* 4 ** Id: llex.c,v 2.95 2015/11/19 19:16:22 roberto Exp 5 ** Lexical Analyzer 6 ** See Copyright Notice in lua.h 7 */ 8 9 #define llex_c 10 #define LUA_CORE 11 12 #include "lprefix.h" 13 14 15 #ifndef _KERNEL 16 #include <locale.h> 17 #include <string.h> 18 #endif /* _KERNEL */ 19 20 #include "lua.h" 21 22 #include "lctype.h" 23 #include "ldebug.h" 24 #include "ldo.h" 25 #include "lgc.h" 26 #include "llex.h" 27 #include "lobject.h" 28 #include "lparser.h" 29 #include "lstate.h" 30 #include "lstring.h" 31 #include "ltable.h" 32 #include "lzio.h" 33 34 35 36 #define next(ls) (ls->current = zgetc(ls->z)) 37 38 39 40 #define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') 41 42 43 /* ORDER RESERVED */ 44 static const char *const luaX_tokens [] = { 45 "and", "break", "do", "else", "elseif", 46 "end", "false", "for", "function", "goto", "if", 47 "in", "local", "nil", "not", "or", "repeat", 48 "return", "then", "true", "until", "while", 49 "//", "..", "...", "==", ">=", "<=", "~=", 50 "<<", ">>", "::", "<eof>", 51 "<number>", "<integer>", "<name>", "<string>" 52 }; 53 54 55 #define save_and_next(ls) (save(ls, ls->current), next(ls)) 56 57 58 static l_noret lexerror (LexState *ls, const char *msg, int token); 59 60 61 static void save (LexState *ls, int c) { 62 Mbuffer *b = ls->buff; 63 if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { 64 size_t newsize; 65 if (luaZ_sizebuffer(b) >= MAX_SIZE/2) 66 lexerror(ls, "lexical element too long", 0); 67 newsize = luaZ_sizebuffer(b) * 2; 68 luaZ_resizebuffer(ls->L, b, newsize); 69 } 70 b->buffer[luaZ_bufflen(b)++] = cast(char, c); 71 } 72 73 74 void luaX_init (lua_State *L) { 75 int i; 76 TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */ 77 luaC_fix(L, obj2gco(e)); /* never collect this name */ 78 for (i=0; i<NUM_RESERVED; i++) { 79 TString *ts = luaS_new(L, luaX_tokens[i]); 80 luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */ 81 ts->extra = cast_byte(i+1); /* reserved word */ 82 } 83 } 84 85 86 const char *luaX_token2str (LexState *ls, int token) { 87 if (token < FIRST_RESERVED) { /* single-byte symbols? */ 88 lua_assert(token == cast_uchar(token)); 89 return luaO_pushfstring(ls->L, "'%c'", token); 90 } 91 else { 92 const char *s = luaX_tokens[token - FIRST_RESERVED]; 93 if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ 94 return luaO_pushfstring(ls->L, "'%s'", s); 95 else /* names, strings, and numerals */ 96 return s; 97 } 98 } 99 100 101 static const char *txtToken (LexState *ls, int token) { 102 switch (token) { 103 case TK_NAME: case TK_STRING: 104 #ifndef _KERNEL 105 case TK_FLT: case TK_INT: 106 #else /* _KERNEL */ 107 case TK_INT: 108 #endif /* _KERNEL */ 109 save(ls, '\0'); 110 return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff)); 111 default: 112 return luaX_token2str(ls, token); 113 } 114 } 115 116 117 static l_noret lexerror (LexState *ls, const char *msg, int token) { 118 msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber); 119 if (token) 120 luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); 121 luaD_throw(ls->L, LUA_ERRSYNTAX); 122 } 123 124 125 l_noret luaX_syntaxerror (LexState *ls, const char *msg) { 126 lexerror(ls, msg, ls->t.token); 127 } 128 129 130 /* 131 ** creates a new string and anchors it in scanner's table so that 132 ** it will not be collected until the end of the compilation 133 ** (by that time it should be anchored somewhere) 134 */ 135 TString *luaX_newstring (LexState *ls, const char *str, size_t l) { 136 lua_State *L = ls->L; 137 TValue *o; /* entry for 'str' */ 138 TString *ts = luaS_newlstr(L, str, l); /* create new string */ 139 setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ 140 o = luaH_set(L, ls->h, L->top - 1); 141 if (ttisnil(o)) { /* not in use yet? */ 142 /* boolean value does not need GC barrier; 143 table has no metatable, so it does not need to invalidate cache */ 144 setbvalue(o, 1); /* t[string] = true */ 145 luaC_checkGC(L); 146 } 147 else { /* string already present */ 148 ts = tsvalue(keyfromval(o)); /* re-use value previously stored */ 149 } 150 L->top--; /* remove string from stack */ 151 return ts; 152 } 153 154 155 /* 156 ** increment line number and skips newline sequence (any of 157 ** \n, \r, \n\r, or \r\n) 158 */ 159 static void inclinenumber (LexState *ls) { 160 int old = ls->current; 161 lua_assert(currIsNewline(ls)); 162 next(ls); /* skip '\n' or '\r' */ 163 if (currIsNewline(ls) && ls->current != old) 164 next(ls); /* skip '\n\r' or '\r\n' */ 165 if (++ls->linenumber >= MAX_INT) 166 lexerror(ls, "chunk has too many lines", 0); 167 } 168 169 170 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, 171 int firstchar) { 172 ls->t.token = 0; 173 ls->decpoint = '.'; 174 ls->L = L; 175 ls->current = firstchar; 176 ls->lookahead.token = TK_EOS; /* no look-ahead token */ 177 ls->z = z; 178 ls->fs = NULL; 179 ls->linenumber = 1; 180 ls->lastline = 1; 181 ls->source = source; 182 ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */ 183 luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ 184 } 185 186 187 188 /* 189 ** ======================================================= 190 ** LEXICAL ANALYZER 191 ** ======================================================= 192 */ 193 194 195 static int check_next1 (LexState *ls, int c) { 196 if (ls->current == c) { 197 next(ls); 198 return 1; 199 } 200 else return 0; 201 } 202 203 204 /* 205 ** Check whether current char is in set 'set' (with two chars) and 206 ** saves it 207 */ 208 static int check_next2 (LexState *ls, const char *set) { 209 lua_assert(set[2] == '\0'); 210 if (ls->current == set[0] || ls->current == set[1]) { 211 save_and_next(ls); 212 return 1; 213 } 214 else return 0; 215 } 216 217 218 #ifndef _KERNEL 219 /* 220 ** change all characters 'from' in buffer to 'to' 221 */ 222 static void buffreplace (LexState *ls, char from, char to) { 223 if (from != to) { 224 size_t n = luaZ_bufflen(ls->buff); 225 char *p = luaZ_buffer(ls->buff); 226 while (n--) 227 if (p[n] == from) p[n] = to; 228 } 229 } 230 231 232 /* 233 ** in case of format error, try to change decimal point separator to 234 ** the one defined in the current locale and check again 235 */ 236 static void trydecpoint (LexState *ls, TValue *o) { 237 char old = ls->decpoint; 238 ls->decpoint = lua_getlocaledecpoint(); 239 buffreplace(ls, old, ls->decpoint); /* try new decimal separator */ 240 if (luaO_str2num(luaZ_buffer(ls->buff), o) == 0) { 241 /* format error with correct decimal point: no more options */ 242 buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */ 243 lexerror(ls, "malformed number", TK_FLT); 244 } 245 } 246 247 248 /* LUA_NUMBER */ 249 /* 250 ** this function is quite liberal in what it accepts, as 'luaO_str2num' 251 ** will reject ill-formed numerals. 252 */ 253 static int read_numeral (LexState *ls, SemInfo *seminfo) { 254 TValue obj; 255 const char *expo = "Ee"; 256 int first = ls->current; 257 lua_assert(lisdigit(ls->current)); 258 save_and_next(ls); 259 if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */ 260 expo = "Pp"; 261 for (;;) { 262 if (check_next2(ls, expo)) /* exponent part? */ 263 check_next2(ls, "-+"); /* optional exponent sign */ 264 if (lisxdigit(ls->current)) 265 save_and_next(ls); 266 else if (ls->current == '.') 267 save_and_next(ls); 268 else break; 269 } 270 save(ls, '\0'); 271 buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */ 272 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 273 trydecpoint(ls, &obj); /* try to update decimal point separator */ 274 if (ttisinteger(&obj)) { 275 seminfo->i = ivalue(&obj); 276 return TK_INT; 277 } 278 else { 279 lua_assert(ttisfloat(&obj)); 280 seminfo->r = fltvalue(&obj); 281 return TK_FLT; 282 } 283 } 284 285 #else /* _KERNEL */ 286 287 static int read_numeral (LexState *ls, SemInfo *seminfo) { 288 TValue obj; 289 int first = ls->current; 290 lua_assert(lisdigit(ls->current)); 291 save_and_next(ls); 292 if (first == '0') 293 check_next2(ls, "xX"); /* hexadecimal? */ 294 for (;;) { 295 if (lisxdigit(ls->current)) 296 save_and_next(ls); 297 else break; 298 } 299 save(ls, '\0'); 300 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 301 lexerror(ls, "malformed number", TK_INT); 302 lua_assert(ttisinteger(&obj)); 303 seminfo->i = ivalue(&obj); 304 return TK_INT; 305 } 306 #endif /* _KERNEL */ 307 308 /* 309 ** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return 310 ** its number of '='s; otherwise, return a negative number (-1 iff there 311 ** are no '='s after initial bracket) 312 */ 313 static int skip_sep (LexState *ls) { 314 int count = 0; 315 int s = ls->current; 316 lua_assert(s == '[' || s == ']'); 317 save_and_next(ls); 318 while (ls->current == '=') { 319 save_and_next(ls); 320 count++; 321 } 322 return (ls->current == s) ? count : (-count) - 1; 323 } 324 325 326 static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { 327 int line = ls->linenumber; /* initial line (for error message) */ 328 save_and_next(ls); /* skip 2nd '[' */ 329 if (currIsNewline(ls)) /* string starts with a newline? */ 330 inclinenumber(ls); /* skip it */ 331 for (;;) { 332 switch (ls->current) { 333 case EOZ: { /* error */ 334 const char *what = (seminfo ? "string" : "comment"); 335 const char *msg = luaO_pushfstring(ls->L, 336 "unfinished long %s (starting at line %d)", what, line); 337 lexerror(ls, msg, TK_EOS); 338 break; /* to avoid warnings */ 339 } 340 case ']': { 341 if (skip_sep(ls) == sep) { 342 save_and_next(ls); /* skip 2nd ']' */ 343 goto endloop; 344 } 345 break; 346 } 347 case '\n': case '\r': { 348 save(ls, '\n'); 349 inclinenumber(ls); 350 if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ 351 break; 352 } 353 default: { 354 if (seminfo) save_and_next(ls); 355 else next(ls); 356 } 357 } 358 } endloop: 359 if (seminfo) 360 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), 361 luaZ_bufflen(ls->buff) - 2*(2 + sep)); 362 } 363 364 365 static void esccheck (LexState *ls, int c, const char *msg) { 366 if (!c) { 367 if (ls->current != EOZ) 368 save_and_next(ls); /* add current to buffer for error message */ 369 lexerror(ls, msg, TK_STRING); 370 } 371 } 372 373 374 static int gethexa (LexState *ls) { 375 save_and_next(ls); 376 esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); 377 return luaO_hexavalue(ls->current); 378 } 379 380 381 static int readhexaesc (LexState *ls) { 382 int r = gethexa(ls); 383 r = (r << 4) + gethexa(ls); 384 luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */ 385 return r; 386 } 387 388 389 static unsigned long readutf8esc (LexState *ls) { 390 unsigned long r; 391 int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */ 392 save_and_next(ls); /* skip 'u' */ 393 esccheck(ls, ls->current == '{', "missing '{'"); 394 r = gethexa(ls); /* must have at least one digit */ 395 while ((save_and_next(ls), lisxdigit(ls->current))) { 396 i++; 397 r = (r << 4) + luaO_hexavalue(ls->current); 398 esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large"); 399 } 400 esccheck(ls, ls->current == '}', "missing '}'"); 401 next(ls); /* skip '}' */ 402 luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ 403 return r; 404 } 405 406 407 static void utf8esc (LexState *ls) { 408 char buff[UTF8BUFFSZ]; 409 int n = luaO_utf8esc(buff, readutf8esc(ls)); 410 for (; n > 0; n--) /* add 'buff' to string */ 411 save(ls, buff[UTF8BUFFSZ - n]); 412 } 413 414 415 static int readdecesc (LexState *ls) { 416 int i; 417 int r = 0; /* result accumulator */ 418 for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ 419 r = 10*r + ls->current - '0'; 420 save_and_next(ls); 421 } 422 esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); 423 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ 424 return r; 425 } 426 427 428 static void read_string (LexState *ls, int del, SemInfo *seminfo) { 429 save_and_next(ls); /* keep delimiter (for error messages) */ 430 while (ls->current != del) { 431 switch (ls->current) { 432 case EOZ: 433 lexerror(ls, "unfinished string", TK_EOS); 434 break; /* to avoid warnings */ 435 case '\n': 436 case '\r': 437 lexerror(ls, "unfinished string", TK_STRING); 438 break; /* to avoid warnings */ 439 case '\\': { /* escape sequences */ 440 int c; /* final character to be saved */ 441 save_and_next(ls); /* keep '\\' for error messages */ 442 switch (ls->current) { 443 case 'a': c = '\a'; goto read_save; 444 case 'b': c = '\b'; goto read_save; 445 case 'f': c = '\f'; goto read_save; 446 case 'n': c = '\n'; goto read_save; 447 case 'r': c = '\r'; goto read_save; 448 case 't': c = '\t'; goto read_save; 449 case 'v': c = '\v'; goto read_save; 450 case 'x': c = readhexaesc(ls); goto read_save; 451 case 'u': utf8esc(ls); goto no_save; 452 case '\n': case '\r': 453 inclinenumber(ls); c = '\n'; goto only_save; 454 case '\\': case '\"': case '\'': 455 c = ls->current; goto read_save; 456 case EOZ: goto no_save; /* will raise an error next loop */ 457 case 'z': { /* zap following span of spaces */ 458 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 459 next(ls); /* skip the 'z' */ 460 while (lisspace(ls->current)) { 461 if (currIsNewline(ls)) inclinenumber(ls); 462 else next(ls); 463 } 464 goto no_save; 465 } 466 default: { 467 esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); 468 c = readdecesc(ls); /* digital escape '\ddd' */ 469 goto only_save; 470 } 471 } 472 read_save: 473 next(ls); 474 /* go through */ 475 only_save: 476 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 477 save(ls, c); 478 /* go through */ 479 no_save: break; 480 } 481 default: 482 save_and_next(ls); 483 } 484 } 485 save_and_next(ls); /* skip delimiter */ 486 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, 487 luaZ_bufflen(ls->buff) - 2); 488 } 489 490 491 static int llex (LexState *ls, SemInfo *seminfo) { 492 luaZ_resetbuffer(ls->buff); 493 for (;;) { 494 switch (ls->current) { 495 case '\n': case '\r': { /* line breaks */ 496 inclinenumber(ls); 497 break; 498 } 499 case ' ': case '\f': case '\t': case '\v': { /* spaces */ 500 next(ls); 501 break; 502 } 503 case '-': { /* '-' or '--' (comment) */ 504 next(ls); 505 if (ls->current != '-') return '-'; 506 /* else is a comment */ 507 next(ls); 508 if (ls->current == '[') { /* long comment? */ 509 int sep = skip_sep(ls); 510 luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */ 511 if (sep >= 0) { 512 read_long_string(ls, NULL, sep); /* skip long comment */ 513 luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ 514 break; 515 } 516 } 517 /* else short comment */ 518 while (!currIsNewline(ls) && ls->current != EOZ) 519 next(ls); /* skip until end of line (or end of file) */ 520 break; 521 } 522 case '[': { /* long string or simply '[' */ 523 int sep = skip_sep(ls); 524 if (sep >= 0) { 525 read_long_string(ls, seminfo, sep); 526 return TK_STRING; 527 } 528 else if (sep != -1) /* '[=...' missing second bracket */ 529 lexerror(ls, "invalid long string delimiter", TK_STRING); 530 return '['; 531 } 532 case '=': { 533 next(ls); 534 if (check_next1(ls, '=')) return TK_EQ; 535 else return '='; 536 } 537 case '<': { 538 next(ls); 539 if (check_next1(ls, '=')) return TK_LE; 540 else if (check_next1(ls, '<')) return TK_SHL; 541 else return '<'; 542 } 543 case '>': { 544 next(ls); 545 if (check_next1(ls, '=')) return TK_GE; 546 else if (check_next1(ls, '>')) return TK_SHR; 547 else return '>'; 548 } 549 case '/': { 550 next(ls); 551 if (check_next1(ls, '/')) return TK_IDIV; 552 else return '/'; 553 } 554 case '~': { 555 next(ls); 556 if (check_next1(ls, '=')) return TK_NE; 557 else return '~'; 558 } 559 case ':': { 560 next(ls); 561 if (check_next1(ls, ':')) return TK_DBCOLON; 562 else return ':'; 563 } 564 case '"': case '\'': { /* short literal strings */ 565 read_string(ls, ls->current, seminfo); 566 return TK_STRING; 567 } 568 case '.': { /* '.', '..', '...', or number */ 569 save_and_next(ls); 570 if (check_next1(ls, '.')) { 571 if (check_next1(ls, '.')) 572 return TK_DOTS; /* '...' */ 573 else return TK_CONCAT; /* '..' */ 574 } 575 #ifndef _KERNEL 576 else if (!lisdigit(ls->current)) return '.'; 577 else return read_numeral(ls, seminfo); 578 #else /* _KERNEL */ 579 else return '.'; 580 #endif /* _KERNEL */ 581 } 582 case '0': case '1': case '2': case '3': case '4': 583 case '5': case '6': case '7': case '8': case '9': { 584 return read_numeral(ls, seminfo); 585 } 586 case EOZ: { 587 return TK_EOS; 588 } 589 default: { 590 if (lislalpha(ls->current)) { /* identifier or reserved word? */ 591 TString *ts; 592 do { 593 save_and_next(ls); 594 } while (lislalnum(ls->current)); 595 ts = luaX_newstring(ls, luaZ_buffer(ls->buff), 596 luaZ_bufflen(ls->buff)); 597 seminfo->ts = ts; 598 if (isreserved(ts)) /* reserved word? */ 599 return ts->extra - 1 + FIRST_RESERVED; 600 else { 601 return TK_NAME; 602 } 603 } 604 else { /* single-char tokens (+ - / ...) */ 605 int c = ls->current; 606 next(ls); 607 return c; 608 } 609 } 610 } 611 } 612 } 613 614 615 void luaX_next (LexState *ls) { 616 ls->lastline = ls->linenumber; 617 if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */ 618 ls->t = ls->lookahead; /* use this one */ 619 ls->lookahead.token = TK_EOS; /* and discharge it */ 620 } 621 else 622 ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ 623 } 624 625 626 int luaX_lookahead (LexState *ls) { 627 lua_assert(ls->lookahead.token == TK_EOS); 628 ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); 629 return ls->lookahead.token; 630 } 631 632