xref: /netbsd-src/external/mit/lua/dist/src/llex.c (revision 200d779b75dbeafa7bc01fd0f60bc61185f6967b)
1 /*	$NetBSD: llex.c,v 1.4 2015/02/19 04:46:22 lneto Exp $	*/
2 
3 /*
4 ** Id: llex.c,v 2.89 2014/11/14 16:06:09 roberto Exp
5 ** Lexical Analyzer
6 ** See Copyright Notice in lua.h
7 */
8 
9 #define llex_c
10 #define LUA_CORE
11 
12 #include "lprefix.h"
13 
14 
15 #ifndef _KERNEL
16 #include <locale.h>
17 #include <string.h>
18 #endif
19 
20 #include "lua.h"
21 
22 #include "lctype.h"
23 #include "ldo.h"
24 #include "lgc.h"
25 #include "llex.h"
26 #include "lobject.h"
27 #include "lparser.h"
28 #include "lstate.h"
29 #include "lstring.h"
30 #include "ltable.h"
31 #include "lzio.h"
32 
33 
34 
35 #define next(ls) (ls->current = zgetc(ls->z))
36 
37 
38 
39 #define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')
40 
41 
42 /* ORDER RESERVED */
43 static const char *const luaX_tokens [] = {
44     "and", "break", "do", "else", "elseif",
45     "end", "false", "for", "function", "goto", "if",
46     "in", "local", "nil", "not", "or", "repeat",
47     "return", "then", "true", "until", "while",
48     "//", "..", "...", "==", ">=", "<=", "~=",
49     "<<", ">>", "::", "<eof>",
50     "<number>", "<integer>", "<name>", "<string>"
51 };
52 
53 
54 #define save_and_next(ls) (save(ls, ls->current), next(ls))
55 
56 
57 static l_noret lexerror (LexState *ls, const char *msg, int token);
58 
59 
60 static void save (LexState *ls, int c) {
61   Mbuffer *b = ls->buff;
62   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
63     size_t newsize;
64     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
65       lexerror(ls, "lexical element too long", 0);
66     newsize = luaZ_sizebuffer(b) * 2;
67     luaZ_resizebuffer(ls->L, b, newsize);
68   }
69   b->buffer[luaZ_bufflen(b)++] = cast(char, c);
70 }
71 
72 
73 void luaX_init (lua_State *L) {
74   int i;
75   TString *e = luaS_new(L, LUA_ENV);  /* create env name */
76   luaC_fix(L, obj2gco(e));  /* never collect this name */
77   for (i=0; i<NUM_RESERVED; i++) {
78     TString *ts = luaS_new(L, luaX_tokens[i]);
79     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
80     ts->extra = cast_byte(i+1);  /* reserved word */
81   }
82 }
83 
84 
85 const char *luaX_token2str (LexState *ls, int token) {
86   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
87     lua_assert(token == cast_uchar(token));
88     return luaO_pushfstring(ls->L, "'%c'", token);
89   }
90   else {
91     const char *s = luaX_tokens[token - FIRST_RESERVED];
92     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
93       return luaO_pushfstring(ls->L, "'%s'", s);
94     else  /* names, strings, and numerals */
95       return s;
96   }
97 }
98 
99 
100 static const char *txtToken (LexState *ls, int token) {
101   switch (token) {
102     case TK_NAME: case TK_STRING:
103 #ifndef _KERNEL
104     case TK_FLT: case TK_INT:
105 #else
106     case TK_INT:
107 #endif
108       save(ls, '\0');
109       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
110     default:
111       return luaX_token2str(ls, token);
112   }
113 }
114 
115 
116 static l_noret lexerror (LexState *ls, const char *msg, int token) {
117   char buff[LUA_IDSIZE];
118   luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE);
119   msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg);
120   if (token)
121     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
122   luaD_throw(ls->L, LUA_ERRSYNTAX);
123 }
124 
125 
126 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
127   lexerror(ls, msg, ls->t.token);
128 }
129 
130 
131 /*
132 ** creates a new string and anchors it in scanner's table so that
133 ** it will not be collected until the end of the compilation
134 ** (by that time it should be anchored somewhere)
135 */
136 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
137   lua_State *L = ls->L;
138   TValue *o;  /* entry for 'str' */
139   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
140   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
141   o = luaH_set(L, ls->h, L->top - 1);
142   if (ttisnil(o)) {  /* not in use yet? */
143     /* boolean value does not need GC barrier;
144        table has no metatable, so it does not need to invalidate cache */
145     setbvalue(o, 1);  /* t[string] = true */
146     luaC_checkGC(L);
147   }
148   else {  /* string already present */
149     ts = tsvalue(keyfromval(o));  /* re-use value previously stored */
150   }
151   L->top--;  /* remove string from stack */
152   return ts;
153 }
154 
155 
156 /*
157 ** increment line number and skips newline sequence (any of
158 ** \n, \r, \n\r, or \r\n)
159 */
160 static void inclinenumber (LexState *ls) {
161   int old = ls->current;
162   lua_assert(currIsNewline(ls));
163   next(ls);  /* skip '\n' or '\r' */
164   if (currIsNewline(ls) && ls->current != old)
165     next(ls);  /* skip '\n\r' or '\r\n' */
166   if (++ls->linenumber >= MAX_INT)
167     lexerror(ls, "chunk has too many lines", 0);
168 }
169 
170 
171 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
172                     int firstchar) {
173   ls->t.token = 0;
174   ls->decpoint = '.';
175   ls->L = L;
176   ls->current = firstchar;
177   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
178   ls->z = z;
179   ls->fs = NULL;
180   ls->linenumber = 1;
181   ls->lastline = 1;
182   ls->source = source;
183   ls->envn = luaS_new(L, LUA_ENV);  /* get env name */
184   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
185 }
186 
187 
188 
189 /*
190 ** =======================================================
191 ** LEXICAL ANALYZER
192 ** =======================================================
193 */
194 
195 
196 static int check_next1 (LexState *ls, int c) {
197   if (ls->current == c) {
198     next(ls);
199     return 1;
200   }
201   else return 0;
202 }
203 
204 
205 /*
206 ** Check whether current char is in set 'set' (with two chars) and
207 ** saves it
208 */
209 static int check_next2 (LexState *ls, const char *set) {
210   lua_assert(set[2] == '\0');
211   if (ls->current == set[0] || ls->current == set[1]) {
212     save_and_next(ls);
213     return 1;
214   }
215   else return 0;
216 }
217 
218 
219 #ifndef _KERNEL
220 /*
221 ** change all characters 'from' in buffer to 'to'
222 */
223 static void buffreplace (LexState *ls, char from, char to) {
224   if (from != to) {
225     size_t n = luaZ_bufflen(ls->buff);
226     char *p = luaZ_buffer(ls->buff);
227     while (n--)
228       if (p[n] == from) p[n] = to;
229   }
230 }
231 
232 
233 #if !defined(l_getlocaledecpoint)
234 #define l_getlocaledecpoint()	(localeconv()->decimal_point[0])
235 #endif
236 #endif
237 
238 
239 #define buff2num(b,o)	(luaO_str2num(luaZ_buffer(b), o) != 0)
240 
241 #ifndef _KERNEL
242 /*
243 ** in case of format error, try to change decimal point separator to
244 ** the one defined in the current locale and check again
245 */
246 static void trydecpoint (LexState *ls, TValue *o) {
247   char old = ls->decpoint;
248   ls->decpoint = l_getlocaledecpoint();
249   buffreplace(ls, old, ls->decpoint);  /* try new decimal separator */
250   if (!buff2num(ls->buff, o)) {
251     /* format error with correct decimal point: no more options */
252     buffreplace(ls, ls->decpoint, '.');  /* undo change (for error message) */
253     lexerror(ls, "malformed number", TK_FLT);
254   }
255 }
256 
257 
258 /* LUA_NUMBER */
259 /*
260 ** this function is quite liberal in what it accepts, as 'luaO_str2num'
261 ** will reject ill-formed numerals.
262 */
263 static int read_numeral (LexState *ls, SemInfo *seminfo) {
264   TValue obj;
265   const char *expo = "Ee";
266   int first = ls->current;
267   lua_assert(lisdigit(ls->current));
268   save_and_next(ls);
269   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
270     expo = "Pp";
271   for (;;) {
272     if (check_next2(ls, expo))  /* exponent part? */
273       check_next2(ls, "-+");  /* optional exponent sign */
274     if (lisxdigit(ls->current))
275       save_and_next(ls);
276     else if (ls->current == '.')
277       save_and_next(ls);
278     else break;
279   }
280   save(ls, '\0');
281   buffreplace(ls, '.', ls->decpoint);  /* follow locale for decimal point */
282   if (!buff2num(ls->buff, &obj))  /* format error? */
283     trydecpoint(ls, &obj); /* try to update decimal point separator */
284   if (ttisinteger(&obj)) {
285     seminfo->i = ivalue(&obj);
286     return TK_INT;
287   }
288   else {
289     lua_assert(ttisfloat(&obj));
290     seminfo->r = fltvalue(&obj);
291     return TK_FLT;
292   }
293 }
294 
295 #else /* _KERNEL */
296 
297 static int read_numeral (LexState *ls, SemInfo *seminfo) {
298   TValue obj;
299   int first = ls->current;
300   lua_assert(lisdigit(ls->current));
301   save_and_next(ls);
302   if (first == '0')
303     check_next2(ls, "xX");  /* hexadecimal? */
304   for (;;) {
305     if (lisxdigit(ls->current))
306       save_and_next(ls);
307     else break;
308   }
309   save(ls, '\0');
310   if (!buff2num(ls->buff, &obj))  /* format error? */
311     lexerror(ls, "malformed number", TK_INT);
312   lua_assert(ttisinteger(&obj));
313   seminfo->i = ivalue(&obj);
314   return TK_INT;
315 }
316 #endif
317 
318 /*
319 ** skip a sequence '[=*[' or ']=*]' and return its number of '='s or
320 ** -1 if sequence is malformed
321 */
322 static int skip_sep (LexState *ls) {
323   int count = 0;
324   int s = ls->current;
325   lua_assert(s == '[' || s == ']');
326   save_and_next(ls);
327   while (ls->current == '=') {
328     save_and_next(ls);
329     count++;
330   }
331   return (ls->current == s) ? count : (-count) - 1;
332 }
333 
334 
335 static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
336   int line = ls->linenumber;  /* initial line (for error message) */
337   save_and_next(ls);  /* skip 2nd '[' */
338   if (currIsNewline(ls))  /* string starts with a newline? */
339     inclinenumber(ls);  /* skip it */
340   for (;;) {
341     switch (ls->current) {
342       case EOZ: {  /* error */
343         const char *what = (seminfo ? "string" : "comment");
344         const char *msg = luaO_pushfstring(ls->L,
345                      "unfinished long %s (starting at line %d)", what, line);
346         lexerror(ls, msg, TK_EOS);
347         break;  /* to avoid warnings */
348       }
349       case ']': {
350         if (skip_sep(ls) == sep) {
351           save_and_next(ls);  /* skip 2nd ']' */
352           goto endloop;
353         }
354         break;
355       }
356       case '\n': case '\r': {
357         save(ls, '\n');
358         inclinenumber(ls);
359         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
360         break;
361       }
362       default: {
363         if (seminfo) save_and_next(ls);
364         else next(ls);
365       }
366     }
367   } endloop:
368   if (seminfo)
369     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
370                                      luaZ_bufflen(ls->buff) - 2*(2 + sep));
371 }
372 
373 
374 static void esccheck (LexState *ls, int c, const char *msg) {
375   if (!c) {
376     if (ls->current != EOZ)
377       save_and_next(ls);  /* add current to buffer for error message */
378     lexerror(ls, msg, TK_STRING);
379   }
380 }
381 
382 
383 static int gethexa (LexState *ls) {
384   save_and_next(ls);
385   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
386   return luaO_hexavalue(ls->current);
387 }
388 
389 
390 static int readhexaesc (LexState *ls) {
391   int r = gethexa(ls);
392   r = (r << 4) + gethexa(ls);
393   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
394   return r;
395 }
396 
397 
398 static unsigned long readutf8esc (LexState *ls) {
399   unsigned long r;
400   int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
401   save_and_next(ls);  /* skip 'u' */
402   esccheck(ls, ls->current == '{', "missing '{'");
403   r = gethexa(ls);  /* must have at least one digit */
404   while ((save_and_next(ls), lisxdigit(ls->current))) {
405     i++;
406     r = (r << 4) + luaO_hexavalue(ls->current);
407     esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
408   }
409   esccheck(ls, ls->current == '}', "missing '}'");
410   next(ls);  /* skip '}' */
411   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
412   return r;
413 }
414 
415 
416 static void utf8esc (LexState *ls) {
417   char buff[UTF8BUFFSZ];
418   int n = luaO_utf8esc(buff, readutf8esc(ls));
419   for (; n > 0; n--)  /* add 'buff' to string */
420     save(ls, buff[UTF8BUFFSZ - n]);
421 }
422 
423 
424 static int readdecesc (LexState *ls) {
425   int i;
426   int r = 0;  /* result accumulator */
427   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
428     r = 10*r + ls->current - '0';
429     save_and_next(ls);
430   }
431   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
432   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
433   return r;
434 }
435 
436 
437 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
438   save_and_next(ls);  /* keep delimiter (for error messages) */
439   while (ls->current != del) {
440     switch (ls->current) {
441       case EOZ:
442         lexerror(ls, "unfinished string", TK_EOS);
443         break;  /* to avoid warnings */
444       case '\n':
445       case '\r':
446         lexerror(ls, "unfinished string", TK_STRING);
447         break;  /* to avoid warnings */
448       case '\\': {  /* escape sequences */
449         int c;  /* final character to be saved */
450         save_and_next(ls);  /* keep '\\' for error messages */
451         switch (ls->current) {
452           case 'a': c = '\a'; goto read_save;
453           case 'b': c = '\b'; goto read_save;
454           case 'f': c = '\f'; goto read_save;
455           case 'n': c = '\n'; goto read_save;
456           case 'r': c = '\r'; goto read_save;
457           case 't': c = '\t'; goto read_save;
458           case 'v': c = '\v'; goto read_save;
459           case 'x': c = readhexaesc(ls); goto read_save;
460           case 'u': utf8esc(ls);  goto no_save;
461           case '\n': case '\r':
462             inclinenumber(ls); c = '\n'; goto only_save;
463           case '\\': case '\"': case '\'':
464             c = ls->current; goto read_save;
465           case EOZ: goto no_save;  /* will raise an error next loop */
466           case 'z': {  /* zap following span of spaces */
467             luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
468             next(ls);  /* skip the 'z' */
469             while (lisspace(ls->current)) {
470               if (currIsNewline(ls)) inclinenumber(ls);
471               else next(ls);
472             }
473             goto no_save;
474           }
475           default: {
476             esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
477             c = readdecesc(ls);  /* digital escape '\ddd' */
478             goto only_save;
479           }
480         }
481        read_save:
482          next(ls);
483          /* go through */
484        only_save:
485          luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
486          save(ls, c);
487          /* go through */
488        no_save: break;
489       }
490       default:
491         save_and_next(ls);
492     }
493   }
494   save_and_next(ls);  /* skip delimiter */
495   seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
496                                    luaZ_bufflen(ls->buff) - 2);
497 }
498 
499 
500 static int llex (LexState *ls, SemInfo *seminfo) {
501   luaZ_resetbuffer(ls->buff);
502   for (;;) {
503     switch (ls->current) {
504       case '\n': case '\r': {  /* line breaks */
505         inclinenumber(ls);
506         break;
507       }
508       case ' ': case '\f': case '\t': case '\v': {  /* spaces */
509         next(ls);
510         break;
511       }
512       case '-': {  /* '-' or '--' (comment) */
513         next(ls);
514         if (ls->current != '-') return '-';
515         /* else is a comment */
516         next(ls);
517         if (ls->current == '[') {  /* long comment? */
518           int sep = skip_sep(ls);
519           luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
520           if (sep >= 0) {
521             read_long_string(ls, NULL, sep);  /* skip long comment */
522             luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
523             break;
524           }
525         }
526         /* else short comment */
527         while (!currIsNewline(ls) && ls->current != EOZ)
528           next(ls);  /* skip until end of line (or end of file) */
529         break;
530       }
531       case '[': {  /* long string or simply '[' */
532         int sep = skip_sep(ls);
533         if (sep >= 0) {
534           read_long_string(ls, seminfo, sep);
535           return TK_STRING;
536         }
537         else if (sep == -1) return '[';
538         else lexerror(ls, "invalid long string delimiter", TK_STRING);
539       }
540       case '=': {
541         next(ls);
542         if (check_next1(ls, '=')) return TK_EQ;
543         else return '=';
544       }
545       case '<': {
546         next(ls);
547         if (check_next1(ls, '=')) return TK_LE;
548         else if (check_next1(ls, '<')) return TK_SHL;
549         else return '<';
550       }
551       case '>': {
552         next(ls);
553         if (check_next1(ls, '=')) return TK_GE;
554         else if (check_next1(ls, '>')) return TK_SHR;
555         else return '>';
556       }
557       case '/': {
558         next(ls);
559         if (check_next1(ls, '/')) return TK_IDIV;
560         else return '/';
561       }
562       case '~': {
563         next(ls);
564         if (check_next1(ls, '=')) return TK_NE;
565         else return '~';
566       }
567       case ':': {
568         next(ls);
569         if (check_next1(ls, ':')) return TK_DBCOLON;
570         else return ':';
571       }
572       case '"': case '\'': {  /* short literal strings */
573         read_string(ls, ls->current, seminfo);
574         return TK_STRING;
575       }
576       case '.': {  /* '.', '..', '...', or number */
577         save_and_next(ls);
578         if (check_next1(ls, '.')) {
579           if (check_next1(ls, '.'))
580             return TK_DOTS;   /* '...' */
581           else return TK_CONCAT;   /* '..' */
582         }
583 #ifndef _KERNEL
584         else if (!lisdigit(ls->current)) return '.';
585         else return read_numeral(ls, seminfo);
586 #else /* _KERNEL */
587         else return '.';
588 #endif
589       }
590       case '0': case '1': case '2': case '3': case '4':
591       case '5': case '6': case '7': case '8': case '9': {
592         return read_numeral(ls, seminfo);
593       }
594       case EOZ: {
595         return TK_EOS;
596       }
597       default: {
598         if (lislalpha(ls->current)) {  /* identifier or reserved word? */
599           TString *ts;
600           do {
601             save_and_next(ls);
602           } while (lislalnum(ls->current));
603           ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
604                                   luaZ_bufflen(ls->buff));
605           seminfo->ts = ts;
606           if (isreserved(ts))  /* reserved word? */
607             return ts->extra - 1 + FIRST_RESERVED;
608           else {
609             return TK_NAME;
610           }
611         }
612         else {  /* single-char tokens (+ - / ...) */
613           int c = ls->current;
614           next(ls);
615           return c;
616         }
617       }
618     }
619   }
620 }
621 
622 
623 void luaX_next (LexState *ls) {
624   ls->lastline = ls->linenumber;
625   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
626     ls->t = ls->lookahead;  /* use this one */
627     ls->lookahead.token = TK_EOS;  /* and discharge it */
628   }
629   else
630     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
631 }
632 
633 
634 int luaX_lookahead (LexState *ls) {
635   lua_assert(ls->lookahead.token == TK_EOS);
636   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
637   return ls->lookahead.token;
638 }
639 
640