1 /* $NetBSD: llex.c,v 1.14 2023/06/08 21:12:08 nikita Exp $ */ 2 3 /* 4 ** Id: llex.c 5 ** Lexical Analyzer 6 ** See Copyright Notice in lua.h 7 */ 8 9 #define llex_c 10 #define LUA_CORE 11 12 #include "lprefix.h" 13 14 15 #ifndef _KERNEL 16 #include <locale.h> 17 #include <string.h> 18 #endif /* _KERNEL */ 19 20 #include "lua.h" 21 22 #include "lctype.h" 23 #include "ldebug.h" 24 #include "ldo.h" 25 #include "lgc.h" 26 #include "llex.h" 27 #include "lobject.h" 28 #include "lparser.h" 29 #include "lstate.h" 30 #include "lstring.h" 31 #include "ltable.h" 32 #include "lzio.h" 33 34 35 36 #define next(ls) (ls->current = zgetc(ls->z)) 37 38 39 40 #define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') 41 42 43 /* ORDER RESERVED */ 44 static const char *const luaX_tokens [] = { 45 "and", "break", "do", "else", "elseif", 46 "end", "false", "for", "function", "goto", "if", 47 "in", "local", "nil", "not", "or", "repeat", 48 "return", "then", "true", "until", "while", 49 "//", "..", "...", "==", ">=", "<=", "~=", 50 "<<", ">>", "::", "<eof>", 51 "<number>", "<integer>", "<name>", "<string>" 52 }; 53 54 55 #define save_and_next(ls) (save(ls, ls->current), next(ls)) 56 57 58 static l_noret lexerror (LexState *ls, const char *msg, int token); 59 60 61 static void save (LexState *ls, int c) { 62 Mbuffer *b = ls->buff; 63 if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { 64 size_t newsize; 65 if (luaZ_sizebuffer(b) >= MAX_SIZE/2) 66 lexerror(ls, "lexical element too long", 0); 67 newsize = luaZ_sizebuffer(b) * 2; 68 luaZ_resizebuffer(ls->L, b, newsize); 69 } 70 b->buffer[luaZ_bufflen(b)++] = cast_char(c); 71 } 72 73 74 void luaX_init (lua_State *L) { 75 int i; 76 TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */ 77 luaC_fix(L, obj2gco(e)); /* never collect this name */ 78 for (i=0; i<NUM_RESERVED; i++) { 79 TString *ts = luaS_new(L, luaX_tokens[i]); 80 luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */ 81 ts->extra = cast_byte(i+1); /* reserved word */ 82 } 83 } 84 85 86 const char *luaX_token2str (LexState *ls, int token) { 87 if (token < FIRST_RESERVED) { /* single-byte symbols? */ 88 if (lisprint(token)) 89 return luaO_pushfstring(ls->L, "'%c'", token); 90 else /* control character */ 91 return luaO_pushfstring(ls->L, "'<\\%d>'", token); 92 } 93 else { 94 const char *s = luaX_tokens[token - FIRST_RESERVED]; 95 if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ 96 return luaO_pushfstring(ls->L, "'%s'", s); 97 else /* names, strings, and numerals */ 98 return s; 99 } 100 } 101 102 103 static const char *txtToken (LexState *ls, int token) { 104 switch (token) { 105 case TK_NAME: case TK_STRING: 106 #ifndef _KERNEL 107 case TK_FLT: case TK_INT: 108 #else /* _KERNEL */ 109 case TK_INT: 110 #endif /* _KERNEL */ 111 save(ls, '\0'); 112 return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff)); 113 default: 114 return luaX_token2str(ls, token); 115 } 116 } 117 118 119 static l_noret lexerror (LexState *ls, const char *msg, int token) { 120 msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber); 121 if (token) 122 luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); 123 luaD_throw(ls->L, LUA_ERRSYNTAX); 124 } 125 126 127 l_noret luaX_syntaxerror (LexState *ls, const char *msg) { 128 lexerror(ls, msg, ls->t.token); 129 } 130 131 132 /* 133 ** Creates a new string and anchors it in scanner's table so that it 134 ** will not be collected until the end of the compilation; by that time 135 ** it should be anchored somewhere. It also internalizes long strings, 136 ** ensuring there is only one copy of each unique string. The table 137 ** here is used as a set: the string enters as the key, while its value 138 ** is irrelevant. We use the string itself as the value only because it 139 ** is a TValue readily available. Later, the code generation can change 140 ** this value. 141 */ 142 TString *luaX_newstring (LexState *ls, const char *str, size_t l) { 143 lua_State *L = ls->L; 144 TString *ts = luaS_newlstr(L, str, l); /* create new string */ 145 const TValue *o = luaH_getstr(ls->h, ts); 146 if (!ttisnil(o)) /* string already present? */ 147 ts = keystrval(nodefromval(o)); /* get saved copy */ 148 else { /* not in use yet */ 149 TValue *stv = s2v(L->top.p++); /* reserve stack space for string */ 150 setsvalue(L, stv, ts); /* temporarily anchor the string */ 151 luaH_finishset(L, ls->h, stv, o, stv); /* t[string] = string */ 152 /* table is not a metatable, so it does not need to invalidate cache */ 153 luaC_checkGC(L); 154 L->top.p--; /* remove string from stack */ 155 } 156 return ts; 157 } 158 159 160 /* 161 ** increment line number and skips newline sequence (any of 162 ** \n, \r, \n\r, or \r\n) 163 */ 164 static void inclinenumber (LexState *ls) { 165 int old = ls->current; 166 lua_assert(currIsNewline(ls)); 167 next(ls); /* skip '\n' or '\r' */ 168 if (currIsNewline(ls) && ls->current != old) 169 next(ls); /* skip '\n\r' or '\r\n' */ 170 if (++ls->linenumber >= MAX_INT) 171 lexerror(ls, "chunk has too many lines", 0); 172 } 173 174 175 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, 176 int firstchar) { 177 ls->t.token = 0; 178 ls->L = L; 179 ls->current = firstchar; 180 ls->lookahead.token = TK_EOS; /* no look-ahead token */ 181 ls->z = z; 182 ls->fs = NULL; 183 ls->linenumber = 1; 184 ls->lastline = 1; 185 ls->source = source; 186 ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */ 187 luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ 188 } 189 190 191 192 /* 193 ** ======================================================= 194 ** LEXICAL ANALYZER 195 ** ======================================================= 196 */ 197 198 199 static int check_next1 (LexState *ls, int c) { 200 if (ls->current == c) { 201 next(ls); 202 return 1; 203 } 204 else return 0; 205 } 206 207 208 /* 209 ** Check whether current char is in set 'set' (with two chars) and 210 ** saves it 211 */ 212 static int check_next2 (LexState *ls, const char *set) { 213 lua_assert(set[2] == '\0'); 214 if (ls->current == set[0] || ls->current == set[1]) { 215 save_and_next(ls); 216 return 1; 217 } 218 else return 0; 219 } 220 221 222 #ifndef _KERNEL 223 /* LUA_NUMBER */ 224 /* 225 ** This function is quite liberal in what it accepts, as 'luaO_str2num' 226 ** will reject ill-formed numerals. Roughly, it accepts the following 227 ** pattern: 228 ** 229 ** %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))* 230 ** 231 ** The only tricky part is to accept [+-] only after a valid exponent 232 ** mark, to avoid reading '3-4' or '0xe+1' as a single number. 233 ** 234 ** The caller might have already read an initial dot. 235 */ 236 static int read_numeral (LexState *ls, SemInfo *seminfo) { 237 TValue obj; 238 const char *expo = "Ee"; 239 int first = ls->current; 240 lua_assert(lisdigit(ls->current)); 241 save_and_next(ls); 242 if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */ 243 expo = "Pp"; 244 for (;;) { 245 if (check_next2(ls, expo)) /* exponent mark? */ 246 check_next2(ls, "-+"); /* optional exponent sign */ 247 else if (lisxdigit(ls->current) || ls->current == '.') /* '%x|%.' */ 248 save_and_next(ls); 249 else break; 250 } 251 if (lislalpha(ls->current)) /* is numeral touching a letter? */ 252 save_and_next(ls); /* force an error */ 253 save(ls, '\0'); 254 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 255 lexerror(ls, "malformed number", TK_FLT); 256 if (ttisinteger(&obj)) { 257 seminfo->i = ivalue(&obj); 258 return TK_INT; 259 } 260 else { 261 lua_assert(ttisfloat(&obj)); 262 seminfo->r = fltvalue(&obj); 263 return TK_FLT; 264 } 265 } 266 267 #else /* _KERNEL */ 268 269 static int read_numeral (LexState *ls, SemInfo *seminfo) { 270 TValue obj; 271 int first = ls->current; 272 lua_assert(lisdigit(ls->current)); 273 save_and_next(ls); 274 if (first == '0') 275 check_next2(ls, "xX"); /* hexadecimal? */ 276 for (;;) { 277 if (lisxdigit(ls->current)) 278 save_and_next(ls); 279 else break; 280 } 281 save(ls, '\0'); 282 if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ 283 lexerror(ls, "malformed number", TK_INT); 284 lua_assert(ttisinteger(&obj)); 285 seminfo->i = ivalue(&obj); 286 return TK_INT; 287 } 288 #endif /* _KERNEL */ 289 290 /* 291 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If 292 ** sequence is well formed, return its number of '='s + 2; otherwise, 293 ** return 1 if it is a single bracket (no '='s and no 2nd bracket); 294 ** otherwise (an unfinished '[==...') return 0. 295 */ 296 static size_t skip_sep (LexState *ls) { 297 size_t count = 0; 298 int s = ls->current; 299 lua_assert(s == '[' || s == ']'); 300 save_and_next(ls); 301 while (ls->current == '=') { 302 save_and_next(ls); 303 count++; 304 } 305 return (ls->current == s) ? count + 2 306 : (count == 0) ? 1 307 : 0; 308 } 309 310 311 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) { 312 int line = ls->linenumber; /* initial line (for error message) */ 313 save_and_next(ls); /* skip 2nd '[' */ 314 if (currIsNewline(ls)) /* string starts with a newline? */ 315 inclinenumber(ls); /* skip it */ 316 for (;;) { 317 switch (ls->current) { 318 case EOZ: { /* error */ 319 const char *what = (seminfo ? "string" : "comment"); 320 const char *msg = luaO_pushfstring(ls->L, 321 "unfinished long %s (starting at line %d)", what, line); 322 lexerror(ls, msg, TK_EOS); 323 break; /* to avoid warnings */ 324 } 325 case ']': { 326 if (skip_sep(ls) == sep) { 327 save_and_next(ls); /* skip 2nd ']' */ 328 goto endloop; 329 } 330 break; 331 } 332 case '\n': case '\r': { 333 save(ls, '\n'); 334 inclinenumber(ls); 335 if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ 336 break; 337 } 338 default: { 339 if (seminfo) save_and_next(ls); 340 else next(ls); 341 } 342 } 343 } endloop: 344 if (seminfo) 345 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep, 346 luaZ_bufflen(ls->buff) - 2 * sep); 347 } 348 349 350 static void esccheck (LexState *ls, int c, const char *msg) { 351 if (!c) { 352 if (ls->current != EOZ) 353 save_and_next(ls); /* add current to buffer for error message */ 354 lexerror(ls, msg, TK_STRING); 355 } 356 } 357 358 359 static int gethexa (LexState *ls) { 360 save_and_next(ls); 361 esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected"); 362 return luaO_hexavalue(ls->current); 363 } 364 365 366 static int readhexaesc (LexState *ls) { 367 int r = gethexa(ls); 368 r = (r << 4) + gethexa(ls); 369 luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */ 370 return r; 371 } 372 373 374 static unsigned long readutf8esc (LexState *ls) { 375 unsigned long r; 376 int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */ 377 save_and_next(ls); /* skip 'u' */ 378 esccheck(ls, ls->current == '{', "missing '{'"); 379 r = gethexa(ls); /* must have at least one digit */ 380 while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) { 381 i++; 382 esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large"); 383 r = (r << 4) + luaO_hexavalue(ls->current); 384 } 385 esccheck(ls, ls->current == '}', "missing '}'"); 386 next(ls); /* skip '}' */ 387 luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */ 388 return r; 389 } 390 391 392 static void utf8esc (LexState *ls) { 393 char buff[UTF8BUFFSZ]; 394 int n = luaO_utf8esc(buff, readutf8esc(ls)); 395 for (; n > 0; n--) /* add 'buff' to string */ 396 save(ls, buff[UTF8BUFFSZ - n]); 397 } 398 399 400 static int readdecesc (LexState *ls) { 401 int i; 402 int r = 0; /* result accumulator */ 403 for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ 404 r = 10*r + ls->current - '0'; 405 save_and_next(ls); 406 } 407 esccheck(ls, r <= UCHAR_MAX, "decimal escape too large"); 408 luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */ 409 return r; 410 } 411 412 413 static void read_string (LexState *ls, int del, SemInfo *seminfo) { 414 save_and_next(ls); /* keep delimiter (for error messages) */ 415 while (ls->current != del) { 416 switch (ls->current) { 417 case EOZ: 418 lexerror(ls, "unfinished string", TK_EOS); 419 break; /* to avoid warnings */ 420 case '\n': 421 case '\r': 422 lexerror(ls, "unfinished string", TK_STRING); 423 break; /* to avoid warnings */ 424 case '\\': { /* escape sequences */ 425 int c; /* final character to be saved */ 426 save_and_next(ls); /* keep '\\' for error messages */ 427 switch (ls->current) { 428 case 'a': c = '\a'; goto read_save; 429 case 'b': c = '\b'; goto read_save; 430 case 'f': c = '\f'; goto read_save; 431 case 'n': c = '\n'; goto read_save; 432 case 'r': c = '\r'; goto read_save; 433 case 't': c = '\t'; goto read_save; 434 case 'v': c = '\v'; goto read_save; 435 case 'x': c = readhexaesc(ls); goto read_save; 436 case 'u': utf8esc(ls); goto no_save; 437 case '\n': case '\r': 438 inclinenumber(ls); c = '\n'; goto only_save; 439 case '\\': case '\"': case '\'': 440 c = ls->current; goto read_save; 441 case EOZ: goto no_save; /* will raise an error next loop */ 442 case 'z': { /* zap following span of spaces */ 443 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 444 next(ls); /* skip the 'z' */ 445 while (lisspace(ls->current)) { 446 if (currIsNewline(ls)) inclinenumber(ls); 447 else next(ls); 448 } 449 goto no_save; 450 } 451 default: { 452 esccheck(ls, lisdigit(ls->current), "invalid escape sequence"); 453 c = readdecesc(ls); /* digital escape '\ddd' */ 454 goto only_save; 455 } 456 } 457 read_save: 458 next(ls); 459 /* go through */ 460 only_save: 461 luaZ_buffremove(ls->buff, 1); /* remove '\\' */ 462 save(ls, c); 463 /* go through */ 464 no_save: break; 465 } 466 default: 467 save_and_next(ls); 468 } 469 } 470 save_and_next(ls); /* skip delimiter */ 471 seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, 472 luaZ_bufflen(ls->buff) - 2); 473 } 474 475 476 static int llex (LexState *ls, SemInfo *seminfo) { 477 luaZ_resetbuffer(ls->buff); 478 for (;;) { 479 switch (ls->current) { 480 case '\n': case '\r': { /* line breaks */ 481 inclinenumber(ls); 482 break; 483 } 484 case ' ': case '\f': case '\t': case '\v': { /* spaces */ 485 next(ls); 486 break; 487 } 488 case '-': { /* '-' or '--' (comment) */ 489 next(ls); 490 if (ls->current != '-') return '-'; 491 /* else is a comment */ 492 next(ls); 493 if (ls->current == '[') { /* long comment? */ 494 size_t sep = skip_sep(ls); 495 luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */ 496 if (sep >= 2) { 497 read_long_string(ls, NULL, sep); /* skip long comment */ 498 luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ 499 break; 500 } 501 } 502 /* else short comment */ 503 while (!currIsNewline(ls) && ls->current != EOZ) 504 next(ls); /* skip until end of line (or end of file) */ 505 break; 506 } 507 case '[': { /* long string or simply '[' */ 508 size_t sep = skip_sep(ls); 509 if (sep >= 2) { 510 read_long_string(ls, seminfo, sep); 511 return TK_STRING; 512 } 513 else if (sep == 0) /* '[=...' missing second bracket? */ 514 lexerror(ls, "invalid long string delimiter", TK_STRING); 515 return '['; 516 } 517 case '=': { 518 next(ls); 519 if (check_next1(ls, '=')) return TK_EQ; /* '==' */ 520 else return '='; 521 } 522 case '<': { 523 next(ls); 524 if (check_next1(ls, '=')) return TK_LE; /* '<=' */ 525 else if (check_next1(ls, '<')) return TK_SHL; /* '<<' */ 526 else return '<'; 527 } 528 case '>': { 529 next(ls); 530 if (check_next1(ls, '=')) return TK_GE; /* '>=' */ 531 else if (check_next1(ls, '>')) return TK_SHR; /* '>>' */ 532 else return '>'; 533 } 534 case '/': { 535 next(ls); 536 if (check_next1(ls, '/')) return TK_IDIV; /* '//' */ 537 else return '/'; 538 } 539 case '~': { 540 next(ls); 541 if (check_next1(ls, '=')) return TK_NE; /* '~=' */ 542 else return '~'; 543 } 544 case ':': { 545 next(ls); 546 if (check_next1(ls, ':')) return TK_DBCOLON; /* '::' */ 547 else return ':'; 548 } 549 case '"': case '\'': { /* short literal strings */ 550 read_string(ls, ls->current, seminfo); 551 return TK_STRING; 552 } 553 case '.': { /* '.', '..', '...', or number */ 554 save_and_next(ls); 555 if (check_next1(ls, '.')) { 556 if (check_next1(ls, '.')) 557 return TK_DOTS; /* '...' */ 558 else return TK_CONCAT; /* '..' */ 559 } 560 #ifndef _KERNEL 561 else if (!lisdigit(ls->current)) return '.'; 562 else return read_numeral(ls, seminfo); 563 #else /* _KERNEL */ 564 else return '.'; 565 #endif /* _KERNEL */ 566 } 567 case '0': case '1': case '2': case '3': case '4': 568 case '5': case '6': case '7': case '8': case '9': { 569 return read_numeral(ls, seminfo); 570 } 571 case EOZ: { 572 return TK_EOS; 573 } 574 default: { 575 if (lislalpha(ls->current)) { /* identifier or reserved word? */ 576 TString *ts; 577 do { 578 save_and_next(ls); 579 } while (lislalnum(ls->current)); 580 ts = luaX_newstring(ls, luaZ_buffer(ls->buff), 581 luaZ_bufflen(ls->buff)); 582 seminfo->ts = ts; 583 if (isreserved(ts)) /* reserved word? */ 584 return ts->extra - 1 + FIRST_RESERVED; 585 else { 586 return TK_NAME; 587 } 588 } 589 else { /* single-char tokens ('+', '*', '%', '{', '}', ...) */ 590 int c = ls->current; 591 next(ls); 592 return c; 593 } 594 } 595 } 596 } 597 } 598 599 600 void luaX_next (LexState *ls) { 601 ls->lastline = ls->linenumber; 602 if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */ 603 ls->t = ls->lookahead; /* use this one */ 604 ls->lookahead.token = TK_EOS; /* and discharge it */ 605 } 606 else 607 ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ 608 } 609 610 611 int luaX_lookahead (LexState *ls) { 612 lua_assert(ls->lookahead.token == TK_EOS); 613 ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); 614 return ls->lookahead.token; 615 } 616 617