//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
26794366a2SRui Ueyama // 27794366a2SRui Ueyama //===----------------------------------------------------------------------===// 28794366a2SRui Ueyama 29794366a2SRui Ueyama #include "ScriptLexer.h" 30a7e8bddfSFangrui Song #include "Config.h" 31b8a59c8aSBob Haarman #include "lld/Common/ErrorHandler.h" 32794366a2SRui Ueyama #include "llvm/ADT/Twine.h" 3327bb7990SFangrui Song #include "llvm/Support/ErrorHandling.h" 34a7e8bddfSFangrui Song #include "llvm/Support/FileSystem.h" 35a7e8bddfSFangrui Song #include "llvm/Support/Path.h" 3627bb7990SFangrui Song #include <algorithm> 37794366a2SRui Ueyama 38794366a2SRui Ueyama using namespace llvm; 3907837b8fSFangrui Song using namespace lld; 4007837b8fSFangrui Song using namespace lld::elf; 41794366a2SRui Ueyama 42cf57a670SFangrui Song ScriptLexer::Buffer::Buffer(Ctx &ctx, MemoryBufferRef mb) 43a7e8bddfSFangrui Song : s(mb.getBuffer()), filename(mb.getBufferIdentifier()), 44a7e8bddfSFangrui Song begin(mb.getBufferStart()) { 45cf57a670SFangrui Song if (ctx.arg.sysroot == "") 46a7e8bddfSFangrui Song return; 47a7e8bddfSFangrui Song StringRef path = filename; 48a7e8bddfSFangrui Song for (; !path.empty(); path = sys::path::parent_path(path)) { 49cf57a670SFangrui Song if (!sys::fs::equivalent(ctx.arg.sysroot, path)) 50a7e8bddfSFangrui Song continue; 51a7e8bddfSFangrui Song isUnderSysroot = true; 52a7e8bddfSFangrui Song return; 53a7e8bddfSFangrui Song } 54a7e8bddfSFangrui Song } 55a7e8bddfSFangrui Song 56cf57a670SFangrui Song ScriptLexer::ScriptLexer(Ctx &ctx, MemoryBufferRef mb) 57e24457a3SFangrui Song : ctx(ctx), curBuf(ctx, mb), mbs(1, mb) { 588f72b0cbSFangrui Song activeFilenames.insert(mb.getBufferIdentifier()); 598f72b0cbSFangrui Song } 601978c21dSFangrui Song 61794366a2SRui Ueyama // Returns a whole line containing the current token. 
62794366a2SRui Ueyama StringRef ScriptLexer::getLine() { 633837f427SRui Ueyama StringRef s = getCurrentMB().getBuffer(); 64794366a2SRui Ueyama 651978c21dSFangrui Song size_t pos = s.rfind('\n', prevTok.data() - s.data()); 663837f427SRui Ueyama if (pos != StringRef::npos) 673837f427SRui Ueyama s = s.substr(pos + 1); 683837f427SRui Ueyama return s.substr(0, s.find_first_of("\r\n")); 69794366a2SRui Ueyama } 70794366a2SRui Ueyama 71794366a2SRui Ueyama // Returns 0-based column number of the current token. 72794366a2SRui Ueyama size_t ScriptLexer::getColumnNumber() { 731978c21dSFangrui Song return prevTok.data() - getLine().data(); 74794366a2SRui Ueyama } 75794366a2SRui Ueyama 76794366a2SRui Ueyama std::string ScriptLexer::getCurrentLocation() { 77adcd0268SBenjamin Kramer std::string filename = std::string(getCurrentMB().getBufferIdentifier()); 789328c20cSFangrui Song return (filename + ":" + Twine(prevTokLine)).str(); 79794366a2SRui Ueyama } 80794366a2SRui Ueyama 81794366a2SRui Ueyama // We don't want to record cascading errors. Keep only the first one. 823837f427SRui Ueyama void ScriptLexer::setError(const Twine &msg) { 83ed6c106eSFangrui Song if (errCount(ctx)) 84794366a2SRui Ueyama return; 85794366a2SRui Ueyama 863837f427SRui Ueyama std::string s = (getCurrentLocation() + ": " + msg).str(); 871978c21dSFangrui Song if (prevTok.size()) 883837f427SRui Ueyama s += "\n>>> " + getLine().str() + "\n>>> " + 89de2d1066SGeorge Rimar std::string(getColumnNumber(), ' ') + "^"; 9009c2c5e1SFangrui Song ErrAlways(ctx) << s; 91794366a2SRui Ueyama } 92794366a2SRui Ueyama 931978c21dSFangrui Song void ScriptLexer::lex() { 94794366a2SRui Ueyama for (;;) { 951978c21dSFangrui Song StringRef &s = curBuf.s; 963837f427SRui Ueyama s = skipSpace(s); 971978c21dSFangrui Song if (s.empty()) { 981978c21dSFangrui Song // If this buffer is from an INCLUDE command, switch to the "return 991978c21dSFangrui Song // value"; otherwise, mark EOF. 
1001978c21dSFangrui Song if (buffers.empty()) { 1011978c21dSFangrui Song eof = true; 1021978c21dSFangrui Song return; 1031978c21dSFangrui Song } 1048f72b0cbSFangrui Song activeFilenames.erase(curBuf.filename); 1051978c21dSFangrui Song curBuf = buffers.pop_back_val(); 1061978c21dSFangrui Song continue; 1071978c21dSFangrui Song } 1081978c21dSFangrui Song curTokState = inExpr; 109794366a2SRui Ueyama 110794366a2SRui Ueyama // Quoted token. Note that double-quote characters are parts of a token 111794366a2SRui Ueyama // because, in a glob match context, only unquoted tokens are interpreted 112794366a2SRui Ueyama // as glob patterns. Double-quoted tokens are literal patterns in that 113794366a2SRui Ueyama // context. 1148d85c96eSFangrui Song if (s.starts_with("\"")) { 1153837f427SRui Ueyama size_t e = s.find("\"", 1); 1163837f427SRui Ueyama if (e == StringRef::npos) { 1171978c21dSFangrui Song size_t lineno = 1181978c21dSFangrui Song StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n'); 119*483516fdSFangrui Song ErrAlways(ctx) << curBuf.filename << ":" << (lineno + 1) 12009c2c5e1SFangrui Song << ": unclosed quote"; 121794366a2SRui Ueyama return; 122794366a2SRui Ueyama } 123794366a2SRui Ueyama 1241978c21dSFangrui Song curTok = s.take_front(e + 1); 1253837f427SRui Ueyama s = s.substr(e + 1); 1261978c21dSFangrui Song return; 127794366a2SRui Ueyama } 128794366a2SRui Ueyama 1290a0effddSFangrui Song // Some operators form separate tokens. 1308d85c96eSFangrui Song if (s.starts_with("<<=") || s.starts_with(">>=")) { 1311978c21dSFangrui Song curTok = s.substr(0, 3); 1320a0effddSFangrui Song s = s.substr(3); 1331978c21dSFangrui Song return; 1340a0effddSFangrui Song } 1351978c21dSFangrui Song if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { 1361978c21dSFangrui Song curTok = s.substr(0, 2); 1373837f427SRui Ueyama s = s.substr(2); 1381978c21dSFangrui Song return; 139c67d6b2dSRui Ueyama } 140c67d6b2dSRui Ueyama 1411978c21dSFangrui Song // Unquoted token. 
The non-expression token is more relaxed than tokens in 1421978c21dSFangrui Song // C-like languages, so that you can write "file-name.cpp" as one bare 1431978c21dSFangrui Song // token. 1441978c21dSFangrui Song size_t pos; 1451978c21dSFangrui Song if (inExpr) { 1461978c21dSFangrui Song pos = s.find_first_not_of( 1471978c21dSFangrui Song "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 1481978c21dSFangrui Song "0123456789_.$"); 1491978c21dSFangrui Song if (pos == 0 && s.size() >= 2 && 1501978c21dSFangrui Song ((s[0] == s[1] && strchr("<>&|", s[0])) || 1511978c21dSFangrui Song is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) 1521978c21dSFangrui Song pos = 2; 1531978c21dSFangrui Song } else { 1541978c21dSFangrui Song pos = s.find_first_not_of( 155794366a2SRui Ueyama "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 156c67d6b2dSRui Ueyama "0123456789_.$/\\~=+[]*?-!^:"); 157794366a2SRui Ueyama } 158794366a2SRui Ueyama 1591978c21dSFangrui Song if (pos == 0) 1601978c21dSFangrui Song pos = 1; 1611978c21dSFangrui Song curTok = s.substr(0, pos); 1621978c21dSFangrui Song s = s.substr(pos); 1631978c21dSFangrui Song break; 1641978c21dSFangrui Song } 165794366a2SRui Ueyama } 166794366a2SRui Ueyama 167794366a2SRui Ueyama // Skip leading whitespace characters or comments. 
1683837f427SRui Ueyama StringRef ScriptLexer::skipSpace(StringRef s) { 169794366a2SRui Ueyama for (;;) { 1708d85c96eSFangrui Song if (s.starts_with("/*")) { 1713837f427SRui Ueyama size_t e = s.find("*/", 2); 1723837f427SRui Ueyama if (e == StringRef::npos) { 173ae4279bdSGeorgii Rymar setError("unclosed comment in a linker script"); 174794366a2SRui Ueyama return ""; 175794366a2SRui Ueyama } 1769328c20cSFangrui Song curBuf.lineNumber += s.substr(0, e).count('\n'); 1773837f427SRui Ueyama s = s.substr(e + 2); 178794366a2SRui Ueyama continue; 179794366a2SRui Ueyama } 1808d85c96eSFangrui Song if (s.starts_with("#")) { 1813837f427SRui Ueyama size_t e = s.find('\n', 1); 1823837f427SRui Ueyama if (e == StringRef::npos) 1833837f427SRui Ueyama e = s.size() - 1; 1849328c20cSFangrui Song else 1859328c20cSFangrui Song ++curBuf.lineNumber; 1863837f427SRui Ueyama s = s.substr(e + 1); 187794366a2SRui Ueyama continue; 188794366a2SRui Ueyama } 1899328c20cSFangrui Song StringRef saved = s; 1903837f427SRui Ueyama s = s.ltrim(); 1919328c20cSFangrui Song auto len = saved.size() - s.size(); 1929328c20cSFangrui Song if (len == 0) 1933837f427SRui Ueyama return s; 1949328c20cSFangrui Song curBuf.lineNumber += saved.substr(0, len).count('\n'); 195794366a2SRui Ueyama } 196794366a2SRui Ueyama } 197794366a2SRui Ueyama 1981978c21dSFangrui Song // Used to determine whether to stop parsing. Treat errors like EOF. 199ed6c106eSFangrui Song bool ScriptLexer::atEOF() { return eof || errCount(ctx); } 200731a66aeSRui Ueyama 201794366a2SRui Ueyama StringRef ScriptLexer::next() { 2021978c21dSFangrui Song prevTok = peek(); 2039328c20cSFangrui Song // `prevTokLine` is not updated for EOF so that the line number in `setError` 2049328c20cSFangrui Song // will be more useful. 
2059328c20cSFangrui Song if (prevTok.size()) 2069328c20cSFangrui Song prevTokLine = curBuf.lineNumber; 2071978c21dSFangrui Song return std::exchange(curTok, StringRef(curBuf.s.data(), 0)); 208794366a2SRui Ueyama } 209794366a2SRui Ueyama 210f5fce486SRui Ueyama StringRef ScriptLexer::peek() { 2111978c21dSFangrui Song // curTok is invalid if curTokState and inExpr mismatch. 2121978c21dSFangrui Song if (curTok.size() && curTokState != inExpr) { 2131978c21dSFangrui Song curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data()); 2141978c21dSFangrui Song curTok = {}; 2151978c21dSFangrui Song } 2161978c21dSFangrui Song if (curTok.empty()) 2171978c21dSFangrui Song lex(); 2181978c21dSFangrui Song return curTok; 219794366a2SRui Ueyama } 220794366a2SRui Ueyama 2213837f427SRui Ueyama bool ScriptLexer::consume(StringRef tok) { 2221978c21dSFangrui Song if (peek() != tok) 223794366a2SRui Ueyama return false; 2241978c21dSFangrui Song next(); 2251978c21dSFangrui Song return true; 226794366a2SRui Ueyama } 227794366a2SRui Ueyama 228794366a2SRui Ueyama void ScriptLexer::skip() { (void)next(); } 229794366a2SRui Ueyama 2303837f427SRui Ueyama void ScriptLexer::expect(StringRef expect) { 231ed6c106eSFangrui Song if (errCount(ctx)) 232794366a2SRui Ueyama return; 2333837f427SRui Ueyama StringRef tok = next(); 2341978c21dSFangrui Song if (tok != expect) { 2351978c21dSFangrui Song if (atEOF()) 2361978c21dSFangrui Song setError("unexpected EOF"); 2371978c21dSFangrui Song else 2383837f427SRui Ueyama setError(expect + " expected, but got " + tok); 239794366a2SRui Ueyama } 2401978c21dSFangrui Song } 241794366a2SRui Ueyama 2422a89356dSFangrui Song ScriptLexer::Token ScriptLexer::till(StringRef tok) { 2432a89356dSFangrui Song StringRef str = next(); 2442a89356dSFangrui Song if (str == tok) 2452a89356dSFangrui Song return {}; 2462a89356dSFangrui Song if (!atEOF()) 2472a89356dSFangrui Song return {str}; 2482a89356dSFangrui Song prevTok = {}; 2492a89356dSFangrui Song setError("unexpected 
EOF"); 2502a89356dSFangrui Song return {}; 2512a89356dSFangrui Song } 2522a89356dSFangrui Song 253794366a2SRui Ueyama // Returns true if S encloses T. 2543837f427SRui Ueyama static bool encloses(StringRef s, StringRef t) { 2553837f427SRui Ueyama return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end(); 256794366a2SRui Ueyama } 257794366a2SRui Ueyama 258794366a2SRui Ueyama MemoryBufferRef ScriptLexer::getCurrentMB() { 259794366a2SRui Ueyama // Find input buffer containing the current token. 260ac6abc99SFangrui Song assert(!mbs.empty()); 2613837f427SRui Ueyama for (MemoryBufferRef mb : mbs) 2621978c21dSFangrui Song if (encloses(mb.getBuffer(), curBuf.s)) 2633837f427SRui Ueyama return mb; 264794366a2SRui Ueyama llvm_unreachable("getCurrentMB: failed to find a token"); 265794366a2SRui Ueyama } 266