//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
260b57cec5SDimitry Andric // 270b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 280b57cec5SDimitry Andric 290b57cec5SDimitry Andric #include "ScriptLexer.h" 300b57cec5SDimitry Andric #include "lld/Common/ErrorHandler.h" 310b57cec5SDimitry Andric #include "llvm/ADT/Twine.h" 3281ad6265SDimitry Andric #include "llvm/Support/ErrorHandling.h" 3381ad6265SDimitry Andric #include <algorithm> 340b57cec5SDimitry Andric 350b57cec5SDimitry Andric using namespace llvm; 365ffd83dbSDimitry Andric using namespace lld; 375ffd83dbSDimitry Andric using namespace lld::elf; 380b57cec5SDimitry Andric 390b57cec5SDimitry Andric // Returns a whole line containing the current token. 400b57cec5SDimitry Andric StringRef ScriptLexer::getLine() { 410b57cec5SDimitry Andric StringRef s = getCurrentMB().getBuffer(); 420b57cec5SDimitry Andric StringRef tok = tokens[pos - 1]; 430b57cec5SDimitry Andric 440b57cec5SDimitry Andric size_t pos = s.rfind('\n', tok.data() - s.data()); 450b57cec5SDimitry Andric if (pos != StringRef::npos) 460b57cec5SDimitry Andric s = s.substr(pos + 1); 470b57cec5SDimitry Andric return s.substr(0, s.find_first_of("\r\n")); 480b57cec5SDimitry Andric } 490b57cec5SDimitry Andric 500b57cec5SDimitry Andric // Returns 1-based line number of the current token. 510b57cec5SDimitry Andric size_t ScriptLexer::getLineNumber() { 52e837bb5cSDimitry Andric if (pos == 0) 53e837bb5cSDimitry Andric return 1; 540b57cec5SDimitry Andric StringRef s = getCurrentMB().getBuffer(); 550b57cec5SDimitry Andric StringRef tok = tokens[pos - 1]; 56fe6060f1SDimitry Andric const size_t tokOffset = tok.data() - s.data(); 57fe6060f1SDimitry Andric 58fe6060f1SDimitry Andric // For the first token, or when going backwards, start from the beginning of 59fe6060f1SDimitry Andric // the buffer. If this token is after the previous token, start from the 60fe6060f1SDimitry Andric // previous token. 
61fe6060f1SDimitry Andric size_t line = 1; 62fe6060f1SDimitry Andric size_t start = 0; 63fe6060f1SDimitry Andric if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) { 64fe6060f1SDimitry Andric start = lastLineNumberOffset; 65fe6060f1SDimitry Andric line = lastLineNumber; 66fe6060f1SDimitry Andric } 67fe6060f1SDimitry Andric 68fe6060f1SDimitry Andric line += s.substr(start, tokOffset - start).count('\n'); 69fe6060f1SDimitry Andric 70fe6060f1SDimitry Andric // Store the line number of this token for reuse. 71fe6060f1SDimitry Andric lastLineNumberOffset = tokOffset; 72fe6060f1SDimitry Andric lastLineNumber = line; 73fe6060f1SDimitry Andric 74fe6060f1SDimitry Andric return line; 750b57cec5SDimitry Andric } 760b57cec5SDimitry Andric 770b57cec5SDimitry Andric // Returns 0-based column number of the current token. 780b57cec5SDimitry Andric size_t ScriptLexer::getColumnNumber() { 790b57cec5SDimitry Andric StringRef tok = tokens[pos - 1]; 800b57cec5SDimitry Andric return tok.data() - getLine().data(); 810b57cec5SDimitry Andric } 820b57cec5SDimitry Andric 830b57cec5SDimitry Andric std::string ScriptLexer::getCurrentLocation() { 845ffd83dbSDimitry Andric std::string filename = std::string(getCurrentMB().getBufferIdentifier()); 850b57cec5SDimitry Andric return (filename + ":" + Twine(getLineNumber())).str(); 860b57cec5SDimitry Andric } 870b57cec5SDimitry Andric 880b57cec5SDimitry Andric ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); } 890b57cec5SDimitry Andric 900b57cec5SDimitry Andric // We don't want to record cascading errors. Keep only the first one. 
910b57cec5SDimitry Andric void ScriptLexer::setError(const Twine &msg) { 920b57cec5SDimitry Andric if (errorCount()) 930b57cec5SDimitry Andric return; 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric std::string s = (getCurrentLocation() + ": " + msg).str(); 960b57cec5SDimitry Andric if (pos) 970b57cec5SDimitry Andric s += "\n>>> " + getLine().str() + "\n>>> " + 980b57cec5SDimitry Andric std::string(getColumnNumber(), ' ') + "^"; 990b57cec5SDimitry Andric error(s); 1000b57cec5SDimitry Andric } 1010b57cec5SDimitry Andric 1020b57cec5SDimitry Andric // Split S into linker script tokens. 1030b57cec5SDimitry Andric void ScriptLexer::tokenize(MemoryBufferRef mb) { 1040b57cec5SDimitry Andric std::vector<StringRef> vec; 1050b57cec5SDimitry Andric mbs.push_back(mb); 1060b57cec5SDimitry Andric StringRef s = mb.getBuffer(); 1070b57cec5SDimitry Andric StringRef begin = s; 1080b57cec5SDimitry Andric 1090b57cec5SDimitry Andric for (;;) { 1100b57cec5SDimitry Andric s = skipSpace(s); 1110b57cec5SDimitry Andric if (s.empty()) 1120b57cec5SDimitry Andric break; 1130b57cec5SDimitry Andric 1140b57cec5SDimitry Andric // Quoted token. Note that double-quote characters are parts of a token 1150b57cec5SDimitry Andric // because, in a glob match context, only unquoted tokens are interpreted 1160b57cec5SDimitry Andric // as glob patterns. Double-quoted tokens are literal patterns in that 1170b57cec5SDimitry Andric // context. 
11806c3fb27SDimitry Andric if (s.starts_with("\"")) { 1190b57cec5SDimitry Andric size_t e = s.find("\"", 1); 1200b57cec5SDimitry Andric if (e == StringRef::npos) { 1210b57cec5SDimitry Andric StringRef filename = mb.getBufferIdentifier(); 1220b57cec5SDimitry Andric size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n'); 1230b57cec5SDimitry Andric error(filename + ":" + Twine(lineno + 1) + ": unclosed quote"); 1240b57cec5SDimitry Andric return; 1250b57cec5SDimitry Andric } 1260b57cec5SDimitry Andric 1270b57cec5SDimitry Andric vec.push_back(s.take_front(e + 1)); 1280b57cec5SDimitry Andric s = s.substr(e + 1); 1290b57cec5SDimitry Andric continue; 1300b57cec5SDimitry Andric } 1310b57cec5SDimitry Andric 13281ad6265SDimitry Andric // Some operators form separate tokens. 13306c3fb27SDimitry Andric if (s.starts_with("<<=") || s.starts_with(">>=")) { 13481ad6265SDimitry Andric vec.push_back(s.substr(0, 3)); 13581ad6265SDimitry Andric s = s.substr(3); 13681ad6265SDimitry Andric continue; 13781ad6265SDimitry Andric } 13806c3fb27SDimitry Andric if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) || 13981ad6265SDimitry Andric (s[0] == s[1] && strchr("<>&|", s[0])))) { 1400b57cec5SDimitry Andric vec.push_back(s.substr(0, 2)); 1410b57cec5SDimitry Andric s = s.substr(2); 1420b57cec5SDimitry Andric continue; 1430b57cec5SDimitry Andric } 1440b57cec5SDimitry Andric 1450b57cec5SDimitry Andric // Unquoted token. This is more relaxed than tokens in C-like language, 1460b57cec5SDimitry Andric // so that you can write "file-name.cpp" as one bare token, for example. 1470b57cec5SDimitry Andric size_t pos = s.find_first_not_of( 1480b57cec5SDimitry Andric "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 1490b57cec5SDimitry Andric "0123456789_.$/\\~=+[]*?-!^:"); 1500b57cec5SDimitry Andric 1510b57cec5SDimitry Andric // A character that cannot start a word (which is usually a 1520b57cec5SDimitry Andric // punctuation) forms a single character token. 
1530b57cec5SDimitry Andric if (pos == 0) 1540b57cec5SDimitry Andric pos = 1; 1550b57cec5SDimitry Andric vec.push_back(s.substr(0, pos)); 1560b57cec5SDimitry Andric s = s.substr(pos); 1570b57cec5SDimitry Andric } 1580b57cec5SDimitry Andric 1590b57cec5SDimitry Andric tokens.insert(tokens.begin() + pos, vec.begin(), vec.end()); 1600b57cec5SDimitry Andric } 1610b57cec5SDimitry Andric 1620b57cec5SDimitry Andric // Skip leading whitespace characters or comments. 1630b57cec5SDimitry Andric StringRef ScriptLexer::skipSpace(StringRef s) { 1640b57cec5SDimitry Andric for (;;) { 16506c3fb27SDimitry Andric if (s.starts_with("/*")) { 1660b57cec5SDimitry Andric size_t e = s.find("*/", 2); 1670b57cec5SDimitry Andric if (e == StringRef::npos) { 168e8d8bef9SDimitry Andric setError("unclosed comment in a linker script"); 1690b57cec5SDimitry Andric return ""; 1700b57cec5SDimitry Andric } 1710b57cec5SDimitry Andric s = s.substr(e + 2); 1720b57cec5SDimitry Andric continue; 1730b57cec5SDimitry Andric } 17406c3fb27SDimitry Andric if (s.starts_with("#")) { 1750b57cec5SDimitry Andric size_t e = s.find('\n', 1); 1760b57cec5SDimitry Andric if (e == StringRef::npos) 1770b57cec5SDimitry Andric e = s.size() - 1; 1780b57cec5SDimitry Andric s = s.substr(e + 1); 1790b57cec5SDimitry Andric continue; 1800b57cec5SDimitry Andric } 1810b57cec5SDimitry Andric size_t size = s.size(); 1820b57cec5SDimitry Andric s = s.ltrim(); 1830b57cec5SDimitry Andric if (s.size() == size) 1840b57cec5SDimitry Andric return s; 1850b57cec5SDimitry Andric } 1860b57cec5SDimitry Andric } 1870b57cec5SDimitry Andric 1880b57cec5SDimitry Andric // An erroneous token is handled as if it were the last token before EOF. 1890b57cec5SDimitry Andric bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; } 1900b57cec5SDimitry Andric 1910b57cec5SDimitry Andric // Split a given string as an expression. 1920b57cec5SDimitry Andric // This function returns "3", "*" and "5" for "3*5" for example. 
1930b57cec5SDimitry Andric static std::vector<StringRef> tokenizeExpr(StringRef s) { 19406c3fb27SDimitry Andric StringRef ops = "!~*/+-<>?^:="; // List of operators 1950b57cec5SDimitry Andric 1960b57cec5SDimitry Andric // Quoted strings are literal strings, so we don't want to split it. 19706c3fb27SDimitry Andric if (s.starts_with("\"")) 1980b57cec5SDimitry Andric return {s}; 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric // Split S with operators as separators. 2010b57cec5SDimitry Andric std::vector<StringRef> ret; 2020b57cec5SDimitry Andric while (!s.empty()) { 2030b57cec5SDimitry Andric size_t e = s.find_first_of(ops); 2040b57cec5SDimitry Andric 2050b57cec5SDimitry Andric // No need to split if there is no operator. 2060b57cec5SDimitry Andric if (e == StringRef::npos) { 2070b57cec5SDimitry Andric ret.push_back(s); 2080b57cec5SDimitry Andric break; 2090b57cec5SDimitry Andric } 2100b57cec5SDimitry Andric 2115ffd83dbSDimitry Andric // Get a token before the operator. 2120b57cec5SDimitry Andric if (e != 0) 2130b57cec5SDimitry Andric ret.push_back(s.substr(0, e)); 2140b57cec5SDimitry Andric 2150b57cec5SDimitry Andric // Get the operator as a token. 2160b57cec5SDimitry Andric // Keep !=, ==, >=, <=, << and >> operators as a single tokens. 
21706c3fb27SDimitry Andric if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") || 21806c3fb27SDimitry Andric s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") || 21906c3fb27SDimitry Andric s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) { 2200b57cec5SDimitry Andric ret.push_back(s.substr(e, 2)); 2210b57cec5SDimitry Andric s = s.substr(e + 2); 2220b57cec5SDimitry Andric } else { 2230b57cec5SDimitry Andric ret.push_back(s.substr(e, 1)); 2240b57cec5SDimitry Andric s = s.substr(e + 1); 2250b57cec5SDimitry Andric } 2260b57cec5SDimitry Andric } 2270b57cec5SDimitry Andric return ret; 2280b57cec5SDimitry Andric } 2290b57cec5SDimitry Andric 2300b57cec5SDimitry Andric // In contexts where expressions are expected, the lexer should apply 2310b57cec5SDimitry Andric // different tokenization rules than the default one. By default, 2320b57cec5SDimitry Andric // arithmetic operator characters are regular characters, but in the 2330b57cec5SDimitry Andric // expression context, they should be independent tokens. 2340b57cec5SDimitry Andric // 2350b57cec5SDimitry Andric // For example, "foo*3" should be tokenized to "foo", "*" and "3" only 2360b57cec5SDimitry Andric // in the expression context. 2370b57cec5SDimitry Andric // 2380b57cec5SDimitry Andric // This function may split the current token into multiple tokens. 
2390b57cec5SDimitry Andric void ScriptLexer::maybeSplitExpr() { 2400b57cec5SDimitry Andric if (!inExpr || errorCount() || atEOF()) 2410b57cec5SDimitry Andric return; 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric std::vector<StringRef> v = tokenizeExpr(tokens[pos]); 2440b57cec5SDimitry Andric if (v.size() == 1) 2450b57cec5SDimitry Andric return; 2460b57cec5SDimitry Andric tokens.erase(tokens.begin() + pos); 2470b57cec5SDimitry Andric tokens.insert(tokens.begin() + pos, v.begin(), v.end()); 2480b57cec5SDimitry Andric } 2490b57cec5SDimitry Andric 2500b57cec5SDimitry Andric StringRef ScriptLexer::next() { 2510b57cec5SDimitry Andric maybeSplitExpr(); 2520b57cec5SDimitry Andric 2530b57cec5SDimitry Andric if (errorCount()) 2540b57cec5SDimitry Andric return ""; 2550b57cec5SDimitry Andric if (atEOF()) { 2560b57cec5SDimitry Andric setError("unexpected EOF"); 2570b57cec5SDimitry Andric return ""; 2580b57cec5SDimitry Andric } 2590b57cec5SDimitry Andric return tokens[pos++]; 2600b57cec5SDimitry Andric } 2610b57cec5SDimitry Andric 2620b57cec5SDimitry Andric StringRef ScriptLexer::peek() { 2630b57cec5SDimitry Andric StringRef tok = next(); 2640b57cec5SDimitry Andric if (errorCount()) 2650b57cec5SDimitry Andric return ""; 2660b57cec5SDimitry Andric pos = pos - 1; 2670b57cec5SDimitry Andric return tok; 2680b57cec5SDimitry Andric } 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric bool ScriptLexer::consume(StringRef tok) { 271*0fca6ea1SDimitry Andric if (next() == tok) 2720b57cec5SDimitry Andric return true; 273*0fca6ea1SDimitry Andric --pos; 2740b57cec5SDimitry Andric return false; 2750b57cec5SDimitry Andric } 2760b57cec5SDimitry Andric 2770b57cec5SDimitry Andric // Consumes Tok followed by ":". Space is allowed between Tok and ":". 
2780b57cec5SDimitry Andric bool ScriptLexer::consumeLabel(StringRef tok) { 2790b57cec5SDimitry Andric if (consume((tok + ":").str())) 2800b57cec5SDimitry Andric return true; 2810b57cec5SDimitry Andric if (tokens.size() >= pos + 2 && tokens[pos] == tok && 2820b57cec5SDimitry Andric tokens[pos + 1] == ":") { 2830b57cec5SDimitry Andric pos += 2; 2840b57cec5SDimitry Andric return true; 2850b57cec5SDimitry Andric } 2860b57cec5SDimitry Andric return false; 2870b57cec5SDimitry Andric } 2880b57cec5SDimitry Andric 2890b57cec5SDimitry Andric void ScriptLexer::skip() { (void)next(); } 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric void ScriptLexer::expect(StringRef expect) { 2920b57cec5SDimitry Andric if (errorCount()) 2930b57cec5SDimitry Andric return; 2940b57cec5SDimitry Andric StringRef tok = next(); 2950b57cec5SDimitry Andric if (tok != expect) 2960b57cec5SDimitry Andric setError(expect + " expected, but got " + tok); 2970b57cec5SDimitry Andric } 2980b57cec5SDimitry Andric 2990b57cec5SDimitry Andric // Returns true if S encloses T. 3000b57cec5SDimitry Andric static bool encloses(StringRef s, StringRef t) { 3010b57cec5SDimitry Andric return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end(); 3020b57cec5SDimitry Andric } 3030b57cec5SDimitry Andric 3040b57cec5SDimitry Andric MemoryBufferRef ScriptLexer::getCurrentMB() { 3050b57cec5SDimitry Andric // Find input buffer containing the current token. 306e837bb5cSDimitry Andric assert(!mbs.empty()); 307e837bb5cSDimitry Andric if (pos == 0) 308e837bb5cSDimitry Andric return mbs.back(); 3090b57cec5SDimitry Andric for (MemoryBufferRef mb : mbs) 3100b57cec5SDimitry Andric if (encloses(mb.getBuffer(), tokens[pos - 1])) 3110b57cec5SDimitry Andric return mb; 3120b57cec5SDimitry Andric llvm_unreachable("getCurrentMB: failed to find a token"); 3130b57cec5SDimitry Andric } 314