xref: /minix3/external/mit/lua/dist/src/llex.c (revision 0a6a1f1d05b60e214de2f05a7310ddd1f0e590e7)
1 /*	$NetBSD: llex.c,v 1.6 2015/10/08 13:40:16 mbalmer Exp $	*/
2 
3 /*
4 ** Id: llex.c,v 2.93 2015/05/22 17:45:56 roberto Exp
5 ** Lexical Analyzer
6 ** See Copyright Notice in lua.h
7 */
8 
9 #define llex_c
10 #define LUA_CORE
11 
12 #include "lprefix.h"
13 
14 
15 #ifndef _KERNEL
16 #include <locale.h>
17 #include <string.h>
18 #endif
19 
20 #include "lua.h"
21 
22 #include "lctype.h"
23 #include "ldebug.h"
24 #include "ldo.h"
25 #include "lgc.h"
26 #include "llex.h"
27 #include "lobject.h"
28 #include "lparser.h"
29 #include "lstate.h"
30 #include "lstring.h"
31 #include "ltable.h"
32 #include "lzio.h"
33 
34 
35 
/*
** Advance the scanner: read the next character from the input stream
** 'ls->z' into 'ls->current' (the macro's value is that character).
** The argument is parenthesized so any pointer expression may be passed.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))


/* true when the current character is '\n' or '\r' */
#define currIsNewline(ls) \
	((ls)->current == '\n' || (ls)->current == '\r')
41 
42 
/*
** Printable names of the multi-character tokens, indexed by
** (token - FIRST_RESERVED); see 'luaX_token2str'.
** ORDER RESERVED: the sequence below must not be reordered.
*/
static const char *const luaX_tokens [] = {
    /* keywords */
    "and", "break", "do", "else", "elseif", "end",
    "false", "for", "function", "goto", "if", "in",
    "local", "nil", "not", "or", "repeat", "return",
    "then", "true", "until", "while",
    /* multi-character operators and the end-of-stream marker */
    "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::", "<eof>",
    /* generic names for numerals, names, and strings */
    "<number>", "<integer>", "<name>", "<string>"
};
53 
54 
55 #define save_and_next(ls) (save(ls, ls->current), next(ls))
56 
57 
58 static l_noret lexerror (LexState *ls, const char *msg, int token);
59 
60 
/*
** Append character 'c' to the token buffer. When the buffer is full
** its size is doubled first; if it already reached MAX_SIZE/2 a
** "lexical element too long" error is raised instead.
*/
static void save (LexState *ls, int c) {
  Mbuffer *buf = ls->buff;
  if (luaZ_bufflen(buf) + 1 > luaZ_sizebuffer(buf)) {  /* no room left? */
    size_t doubled;
    if (luaZ_sizebuffer(buf) >= MAX_SIZE/2)
      lexerror(ls, "lexical element too long", 0);
    /* keep the new size in a local: 'luaZ_resizebuffer' is a macro */
    doubled = luaZ_sizebuffer(buf) * 2;
    luaZ_resizebuffer(ls->L, buf, doubled);
  }
  buf->buffer[luaZ_bufflen(buf)] = cast(char, c);
  luaZ_bufflen(buf)++;
}
72 
73 
/*
** Create the string for the environment name and one string for each
** of the first NUM_RESERVED entries of 'luaX_tokens'. All of them are
** fixed in the collector, and each reserved word gets its 1-based
** table position stored in its 'extra' field.
*/
void luaX_init (lua_State *L) {
  int idx = 0;
  TString *envname = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(envname));  /* never collect this name */
  while (idx < NUM_RESERVED) {
    TString *word = luaS_new(L, luaX_tokens[idx]);
    luaC_fix(L, obj2gco(word));  /* reserved words are never collected */
    idx++;
    word->extra = cast_byte(idx);  /* mark as reserved (index + 1) */
  }
}
84 
85 
/*
** Return a printable representation of 'token'. Single-byte tokens
** and fixed-format tokens (those below TK_EOS) are quoted through
** 'luaO_pushfstring'; the remaining ones return their entry in
** 'luaX_tokens' unchanged.
*/
const char *luaX_token2str (LexState *ls, int token) {
  const char *name;
  if (token < FIRST_RESERVED) {  /* single-byte symbol? */
    lua_assert(token == cast_uchar(token));
    return luaO_pushfstring(ls->L, "'%c'", token);
  }
  name = luaX_tokens[token - FIRST_RESERVED];
  if (token >= TK_EOS)  /* names, strings, and numerals */
    return name;
  /* fixed format (symbols and reserved words) */
  return luaO_pushfstring(ls->L, "'%s'", name);
}
99 
100 
/*
** Text used to describe 'token' in error messages. For names, strings,
** and numerals it is the (quoted) current contents of the token buffer;
** for every other token it is whatever 'luaX_token2str' produces.
*/
static const char *txtToken (LexState *ls, int token) {
  int usebuffer = (token == TK_NAME || token == TK_STRING ||
                   token == TK_INT);
#ifndef _KERNEL
  if (token == TK_FLT)
    usebuffer = 1;
#endif
  if (!usebuffer)
    return luaX_token2str(ls, token);
  save(ls, '\0');  /* terminate the buffer so it can be used as a C string */
  return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
}
115 
116 
lexerror(LexState * ls,const char * msg,int token)117 static l_noret lexerror (LexState *ls, const char *msg, int token) {
118   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
119   if (token)
120     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
121   luaD_throw(ls->L, LUA_ERRSYNTAX);
122 }
123 
124 
luaX_syntaxerror(LexState * ls,const char * msg)125 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
126   lexerror(ls, msg, ls->t.token);
127 }
128 
129 
130 /*
131 ** creates a new string and anchors it in scanner's table so that
132 ** it will not be collected until the end of the compilation
133 ** (by that time it should be anchored somewhere)
134 */
luaX_newstring(LexState * ls,const char * str,size_t l)135 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
136   lua_State *L = ls->L;
137   TValue *o;  /* entry for 'str' */
138   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
139   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
140   o = luaH_set(L, ls->h, L->top - 1);
141   if (ttisnil(o)) {  /* not in use yet? */
142     /* boolean value does not need GC barrier;
143        table has no metatable, so it does not need to invalidate cache */
144     setbvalue(o, 1);  /* t[string] = true */
145     luaC_checkGC(L);
146   }
147   else {  /* string already present */
148     ts = tsvalue(keyfromval(o));  /* re-use value previously stored */
149   }
150   L->top--;  /* remove string from stack */
151   return ts;
152 }
153 
154 
155 /*
156 ** increment line number and skips newline sequence (any of
157 ** \n, \r, \n\r, or \r\n)
158 */
inclinenumber(LexState * ls)159 static void inclinenumber (LexState *ls) {
160   int old = ls->current;
161   lua_assert(currIsNewline(ls));
162   next(ls);  /* skip '\n' or '\r' */
163   if (currIsNewline(ls) && ls->current != old)
164     next(ls);  /* skip '\n\r' or '\r\n' */
165   if (++ls->linenumber >= MAX_INT)
166     lexerror(ls, "chunk has too many lines", 0);
167 }
168 
169 
/*
** Prepare 'ls' to scan the stream 'z', whose first character
** ('firstchar') has already been read. Line counters start at 1,
** there is no look-ahead token, and the token buffer is sized to
** LUA_MINBUFFER.
*/
void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
                    int firstchar) {
  /* where the input comes from */
  ls->L = L;
  ls->z = z;
  ls->source = source;
  ls->current = firstchar;
  /* initial scanner state */
  ls->linenumber = 1;
  ls->lastline = 1;
  ls->t.token = 0;
  ls->lookahead.token = TK_EOS;  /* no look-ahead token */
  ls->fs = NULL;
  ls->decpoint = '.';
  /* steps that call into the string and memory modules come last */
  ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
  luaZ_resizebuffer(L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
}
185 
186 
187 
188 /*
189 ** =======================================================
190 ** LEXICAL ANALYZER
191 ** =======================================================
192 */
193 
194 
/*
** If the current char is 'c', skip it (without saving) and return 1;
** otherwise leave the input untouched and return 0.
*/
static int check_next1 (LexState *ls, int c) {
  if (ls->current != c)
    return 0;
  next(ls);
  return 1;
}
202 
203 
204 /*
205 ** Check whether current char is in set 'set' (with two chars) and
206 ** saves it
207 */
check_next2(LexState * ls,const char * set)208 static int check_next2 (LexState *ls, const char *set) {
209   lua_assert(set[2] == '\0');
210   if (ls->current == set[0] || ls->current == set[1]) {
211     save_and_next(ls);
212     return 1;
213   }
214   else return 0;
215 }
216 
217 
218 #ifndef _KERNEL
219 /*
220 ** change all characters 'from' in buffer to 'to'
221 */
buffreplace(LexState * ls,char from,char to)222 static void buffreplace (LexState *ls, char from, char to) {
223   if (from != to) {
224     size_t n = luaZ_bufflen(ls->buff);
225     char *p = luaZ_buffer(ls->buff);
226     while (n--)
227       if (p[n] == from) p[n] = to;
228   }
229 }
230 #endif
231 
/*
** Convert the contents of buffer 'b' into the value 'o'; true when
** 'luaO_str2num' returns nonzero (callers treat zero as a format error).
*/
#define buff2num(b,o)	(luaO_str2num(luaZ_buffer(b), (o)) != 0)
233 
234 #ifndef _KERNEL
235 /*
236 ** in case of format error, try to change decimal point separator to
237 ** the one defined in the current locale and check again
238 */
trydecpoint(LexState * ls,TValue * o)239 static void trydecpoint (LexState *ls, TValue *o) {
240   char old = ls->decpoint;
241   ls->decpoint = lua_getlocaledecpoint();
242   buffreplace(ls, old, ls->decpoint);  /* try new decimal separator */
243   if (!buff2num(ls->buff, o)) {
244     /* format error with correct decimal point: no more options */
245     buffreplace(ls, ls->decpoint, '.');  /* undo change (for error message) */
246     lexerror(ls, "malformed number", TK_FLT);
247   }
248 }
249 
250 
251 /* LUA_NUMBER */
252 /*
253 ** this function is quite liberal in what it accepts, as 'luaO_str2num'
254 ** will reject ill-formed numerals.
255 */
read_numeral(LexState * ls,SemInfo * seminfo)256 static int read_numeral (LexState *ls, SemInfo *seminfo) {
257   TValue obj;
258   const char *expo = "Ee";
259   int first = ls->current;
260   lua_assert(lisdigit(ls->current));
261   save_and_next(ls);
262   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
263     expo = "Pp";
264   for (;;) {
265     if (check_next2(ls, expo))  /* exponent part? */
266       check_next2(ls, "-+");  /* optional exponent sign */
267     if (lisxdigit(ls->current))
268       save_and_next(ls);
269     else if (ls->current == '.')
270       save_and_next(ls);
271     else break;
272   }
273   save(ls, '\0');
274   buffreplace(ls, '.', ls->decpoint);  /* follow locale for decimal point */
275   if (!buff2num(ls->buff, &obj))  /* format error? */
276     trydecpoint(ls, &obj); /* try to update decimal point separator */
277   if (ttisinteger(&obj)) {
278     seminfo->i = ivalue(&obj);
279     return TK_INT;
280   }
281   else {
282     lua_assert(ttisfloat(&obj));
283     seminfo->r = fltvalue(&obj);
284     return TK_FLT;
285   }
286 }
287 
288 #else /* _KERNEL */
289 
/*
** Kernel variant: read an integer numeral (an optional "0x"/"0X"
** prefix followed by hexadecimal digits) into the token buffer and
** convert it. Raises "malformed number" when the conversion fails;
** otherwise stores the value in 'seminfo->i' and returns TK_INT.
*/
static int read_numeral (LexState *ls, SemInfo *seminfo) {
  TValue obj;
  int first = ls->current;
  lua_assert(lisdigit(ls->current));
  save_and_next(ls);
  if (first == '0')
    check_next2(ls, "xX");  /* hexadecimal? */
  while (lisxdigit(ls->current))
    save_and_next(ls);
  save(ls, '\0');
  if (!buff2num(ls->buff, &obj))  /* format error? */
    lexerror(ls, "malformed number", TK_INT);
  lua_assert(ttisinteger(&obj));
  seminfo->i = ivalue(&obj);
  return TK_INT;
}
309 #endif
310 
311 /*
312 ** skip a sequence '[=*[' or ']=*]'; if sequence is wellformed, return
313 ** its number of '='s; otherwise, return a negative number (-1 iff there
314 ** are no '='s after initial bracket)
315 */
skip_sep(LexState * ls)316 static int skip_sep (LexState *ls) {
317   int count = 0;
318   int s = ls->current;
319   lua_assert(s == '[' || s == ']');
320   save_and_next(ls);
321   while (ls->current == '=') {
322     save_and_next(ls);
323     count++;
324   }
325   return (ls->current == s) ? count : (-count) - 1;
326 }
327 
328 
/*
** Read the body of a long string or long comment whose opening
** '[=*' has already been consumed by 'skip_sep' ('sep' is the number
** of '='s, which callers in this file guarantee to be non-negative).
** A NULL 'seminfo' means a comment: its text is not kept. Otherwise
** the string between the delimiters is stored in 'seminfo->ts'.
** Raises an error if the input ends before the closing delimiter.
*/
static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
  int line = ls->linenumber;  /* initial line (for error message) */
  save_and_next(ls);  /* skip 2nd '[' */
  if (currIsNewline(ls))  /* string starts with a newline? */
    inclinenumber(ls);  /* skip it */
  for (;;) {
    switch (ls->current) {
      case EOZ: {  /* error */
        const char *what = (seminfo ? "string" : "comment");
        const char *msg = luaO_pushfstring(ls->L,
                     "unfinished long %s (starting at line %d)", what, line);
        lexerror(ls, msg, TK_EOS);
        break;  /* to avoid warnings */
      }
      case ']': {
        if (skip_sep(ls) == sep) {
          save_and_next(ls);  /* skip 2nd ']' */
          goto endloop;
        }
        break;
      }
      case '\n': case '\r': {
        save(ls, '\n');  /* any newline sequence is stored as '\n' */
        inclinenumber(ls);
        if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
        break;
      }
      default: {
        if (seminfo) save_and_next(ls);
        else next(ls);
      }
    }
  } endloop:
  if (seminfo) {
    /* Length of one delimiter ('[=*[' or ']=*]'). Computed in 'size_t'
       because '2*(2 + sep)' in 'int' overflows for huge levels. */
    size_t delim = (size_t)sep + 2;
    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + delim,
                                     luaZ_bufflen(ls->buff) - 2 * delim);
  }
}
366 
367 
/*
** Check condition 'c' for an escape sequence. When it is false, raise
** error 'msg' after adding the offending char (unless it is the end
** of the stream) to the buffer, so that it shows in the message.
*/
static void esccheck (LexState *ls, int c, const char *msg) {
  if (c)
    return;  /* condition holds: nothing to report */
  if (ls->current != EOZ)
    save_and_next(ls);  /* add current to buffer for error message */
  lexerror(ls, msg, TK_STRING);
}
375 
376 
/*
** Save the current char, advance, and require the new current char
** to be a hexadecimal digit; return that digit's value. The digit
** itself is left as the current char.
*/
static int gethexa (LexState *ls) {
  int ishexa;
  save_and_next(ls);
  ishexa = lisxdigit(ls->current);
  esccheck(ls, ishexa, "hexadecimal digit expected");
  return luaO_hexavalue(ls->current);
}
382 
383 
/*
** Read the two hexadecimal digits of a '\xXX' escape and return the
** resulting value. On return the second digit is still the current
** char, and the two chars saved along the way ('x' and the first
** digit) have been removed from the buffer.
*/
static int readhexaesc (LexState *ls) {
  int high = gethexa(ls);
  int low = gethexa(ls);
  luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
  return (high << 4) + low;
}
390 
391 
/*
** Read a '\u{XXX}' escape (current char is the 'u') and return the
** value written in hexadecimal between the braces. Requires at least
** one digit and a value of at most 0x10FFFF. All chars of the escape,
** including the backslash saved by the caller, are removed from the
** buffer and the closing '}' is consumed.
*/
static unsigned long readutf8esc (LexState *ls) {
  unsigned long code;
  int saved = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
  save_and_next(ls);  /* skip 'u' */
  esccheck(ls, ls->current == '{', "missing '{'");
  code = gethexa(ls);  /* must have at least one digit */
  for (;;) {
    save_and_next(ls);  /* keep previous digit (for error messages) */
    if (!lisxdigit(ls->current))
      break;
    saved++;
    code = (code << 4) + luaO_hexavalue(ls->current);
    esccheck(ls, code <= 0x10FFFF, "UTF-8 value too large");
  }
  esccheck(ls, ls->current == '}', "missing '}'");
  next(ls);  /* skip '}' */
  luaZ_buffremove(ls->buff, saved);  /* remove saved chars from buffer */
  return code;
}
408 
409 
/*
** Handle a '\u{XXX}' escape: read its value and append the bytes
** produced by 'luaO_utf8esc' (the last 'len' positions of 'buff')
** to the token buffer.
*/
static void utf8esc (LexState *ls) {
  char buff[UTF8BUFFSZ];
  int len = luaO_utf8esc(buff, readutf8esc(ls));
  int i;
  for (i = UTF8BUFFSZ - len; i < UTF8BUFFSZ; i++)  /* add 'buff' to string */
    save(ls, buff[i]);
}
416 
417 
/*
** Read a decimal escape '\ddd' (up to three digits, starting at the
** current char) and return its value, which must not exceed
** UCHAR_MAX. The digits read are removed from the buffer again.
*/
static int readdecesc (LexState *ls) {
  int value = 0;  /* result accumulator */
  int ndigits = 0;
  while (ndigits < 3 && lisdigit(ls->current)) {  /* read up to 3 digits */
    value = 10*value + ls->current - '0';
    save_and_next(ls);
    ndigits++;
  }
  esccheck(ls, value <= UCHAR_MAX, "decimal escape too large");
  luaZ_buffremove(ls->buff, ndigits);  /* remove read digits from buffer */
  return value;
}
429 
430 
/*
** Process one escape sequence inside a short string; the current char
** is the backslash. The backslash is kept in the buffer while the
** sequence is being read (for error messages) and is gone from it
** when the function returns normally. An end of stream right after
** the backslash is left for the caller to report.
*/
static void read_escape (LexState *ls) {
  int c;  /* final character to be saved */
  int skiplast = 1;  /* is the last char of the escape still current? */
  save_and_next(ls);  /* keep '\\' for error messages */
  switch (ls->current) {
    case 'a': c = '\a'; break;
    case 'b': c = '\b'; break;
    case 'f': c = '\f'; break;
    case 'n': c = '\n'; break;
    case 'r': c = '\r'; break;
    case 't': c = '\t'; break;
    case 'v': c = '\v'; break;
    case 'x': c = readhexaesc(ls); break;
    case 'u': utf8esc(ls); return;  /* saves its own bytes */
    case '\n': case '\r':
      inclinenumber(ls); c = '\n'; skiplast = 0; break;
    case '\\': case '\"': case '\'':
      c = ls->current; break;
    case EOZ: return;  /* caller will raise an error next loop */
    case 'z': {  /* zap following span of spaces */
      luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
      next(ls);  /* skip the 'z' */
      while (lisspace(ls->current)) {
        if (currIsNewline(ls)) inclinenumber(ls);
        else next(ls);
      }
      return;
    }
    default: {
      esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
      c = readdecesc(ls);  /* digital escape '\ddd' */
      skiplast = 0;  /* all digits already consumed */
      break;
    }
  }
  if (skiplast)
    next(ls);
  luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
  save(ls, c);
}


/*
** Read a short literal string delimited by 'del' (the current char)
** and store its contents, without the delimiters, in 'seminfo->ts'.
** Raises "unfinished string" on an unescaped line break or on end of
** stream.
*/
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
  save_and_next(ls);  /* keep delimiter (for error messages) */
  for (;;) {
    int c = ls->current;
    if (c == del)
      break;
    if (c == EOZ)
      lexerror(ls, "unfinished string", TK_EOS);
    else if (c == '\n' || c == '\r')
      lexerror(ls, "unfinished string", TK_STRING);
    else if (c == '\\')
      read_escape(ls);
    else
      save_and_next(ls);
  }
  save_and_next(ls);  /* skip delimiter */
  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                   luaZ_bufflen(ls->buff) - 2);
}
492 
493 
/*
** Scan and return the next token, skipping white space and comments.
** Single-char tokens are returned as their own char value; the other
** tokens use the TK_* codes, with any associated value (string, name,
** or number) stored in 'seminfo'. The token buffer is reset on entry.
*/
static int llex (LexState *ls, SemInfo *seminfo) {
  luaZ_resetbuffer(ls->buff);
  for (;;) {
    switch (ls->current) {
      case '\n': case '\r': {  /* line breaks */
        inclinenumber(ls);
        break;
      }
      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
        next(ls);
        break;
      }
      case '-': {  /* '-' or '--' (comment) */
        int sep = -1;  /* stays negative unless a long comment starts */
        next(ls);
        if (ls->current != '-') return '-';
        /* else is a comment */
        next(ls);
        if (ls->current == '[') {  /* long comment? */
          sep = skip_sep(ls);
          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
        }
        if (sep >= 0) {
          read_long_string(ls, NULL, sep);  /* skip long comment */
          luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
        }
        else {  /* short comment */
          while (ls->current != EOZ && !currIsNewline(ls))
            next(ls);  /* skip until end of line (or end of file) */
        }
        break;
      }
      case '[': {  /* long string or simply '[' */
        int sep = skip_sep(ls);
        if (sep >= 0) {
          read_long_string(ls, seminfo, sep);
          return TK_STRING;
        }
        if (sep != -1)  /* '[=...' missing second bracket */
          lexerror(ls, "invalid long string delimiter", TK_STRING);
        return '[';
      }
      case '=': {
        next(ls);
        return check_next1(ls, '=') ? TK_EQ : '=';
      }
      case '<': {
        next(ls);
        if (check_next1(ls, '=')) return TK_LE;
        if (check_next1(ls, '<')) return TK_SHL;
        return '<';
      }
      case '>': {
        next(ls);
        if (check_next1(ls, '=')) return TK_GE;
        if (check_next1(ls, '>')) return TK_SHR;
        return '>';
      }
      case '/': {
        next(ls);
        return check_next1(ls, '/') ? TK_IDIV : '/';
      }
      case '~': {
        next(ls);
        return check_next1(ls, '=') ? TK_NE : '~';
      }
      case ':': {
        next(ls);
        return check_next1(ls, ':') ? TK_DBCOLON : ':';
      }
      case '"': case '\'': {  /* short literal strings */
        read_string(ls, ls->current, seminfo);
        return TK_STRING;
      }
      case '.': {  /* '.', '..', '...', or number */
        save_and_next(ls);
        if (check_next1(ls, '.'))  /* '..' or '...' */
          return check_next1(ls, '.') ? TK_DOTS : TK_CONCAT;
#ifndef _KERNEL
        if (lisdigit(ls->current))  /* numeral starting with a dot */
          return read_numeral(ls, seminfo);
#endif
        return '.';
      }
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9': {
        return read_numeral(ls, seminfo);
      }
      case EOZ: {
        return TK_EOS;
      }
      default: {
        TString *ts;
        int c = ls->current;
        if (!lislalpha(c)) {  /* single-char tokens (+ - / ...) */
          next(ls);
          return c;
        }
        /* identifier or reserved word */
        do {
          save_and_next(ls);
        } while (lislalnum(ls->current));
        ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
                                luaZ_bufflen(ls->buff));
        seminfo->ts = ts;
        /* reserved words carry their (1-based) index in 'extra' */
        return isreserved(ts) ? ts->extra - 1 + FIRST_RESERVED : TK_NAME;
      }
    }
  }
}
616 
617 
/*
** Advance to the next token: remember the line of the previous one in
** 'lastline', then take the pending look-ahead token if there is one,
** or else scan a new token from the input.
*/
void luaX_next (LexState *ls) {
  ls->lastline = ls->linenumber;
  if (ls->lookahead.token == TK_EOS) {  /* no look-ahead token? */
    ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
    return;
  }
  ls->t = ls->lookahead;  /* use the look-ahead token */
  ls->lookahead.token = TK_EOS;  /* and discharge it */
}
627 
628 
/*
** Scan one token ahead without making it current: the token is kept
** in 'ls->lookahead' and its code is returned. There must be no
** pending look-ahead token when this is called.
*/
int luaX_lookahead (LexState *ls) {
  int tok;
  lua_assert(ls->lookahead.token == TK_EOS);
  tok = llex(ls, &ls->lookahead.seminfo);
  ls->lookahead.token = tok;
  return tok;
}
634 
635