//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of the formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
26 // 27 //===----------------------------------------------------------------------===// 28 29 #include "ScriptLexer.h" 30 #include "Config.h" 31 #include "lld/Common/ErrorHandler.h" 32 #include "llvm/ADT/Twine.h" 33 #include "llvm/Support/ErrorHandling.h" 34 #include "llvm/Support/FileSystem.h" 35 #include "llvm/Support/Path.h" 36 #include <algorithm> 37 38 using namespace llvm; 39 using namespace lld; 40 using namespace lld::elf; 41 42 ScriptLexer::Buffer::Buffer(Ctx &ctx, MemoryBufferRef mb) 43 : s(mb.getBuffer()), filename(mb.getBufferIdentifier()), 44 begin(mb.getBufferStart()) { 45 if (ctx.arg.sysroot == "") 46 return; 47 StringRef path = filename; 48 for (; !path.empty(); path = sys::path::parent_path(path)) { 49 if (!sys::fs::equivalent(ctx.arg.sysroot, path)) 50 continue; 51 isUnderSysroot = true; 52 return; 53 } 54 } 55 56 ScriptLexer::ScriptLexer(Ctx &ctx, MemoryBufferRef mb) 57 : ctx(ctx), curBuf(ctx, mb), mbs(1, mb) { 58 activeFilenames.insert(mb.getBufferIdentifier()); 59 } 60 61 // Returns a whole line containing the current token. 62 StringRef ScriptLexer::getLine() { 63 StringRef s = getCurrentMB().getBuffer(); 64 65 size_t pos = s.rfind('\n', prevTok.data() - s.data()); 66 if (pos != StringRef::npos) 67 s = s.substr(pos + 1); 68 return s.substr(0, s.find_first_of("\r\n")); 69 } 70 71 // Returns 0-based column number of the current token. 72 size_t ScriptLexer::getColumnNumber() { 73 return prevTok.data() - getLine().data(); 74 } 75 76 std::string ScriptLexer::getCurrentLocation() { 77 std::string filename = std::string(getCurrentMB().getBufferIdentifier()); 78 return (filename + ":" + Twine(prevTokLine)).str(); 79 } 80 81 // We don't want to record cascading errors. Keep only the first one. 
82 void ScriptLexer::setError(const Twine &msg) { 83 if (errCount(ctx)) 84 return; 85 86 std::string s = (getCurrentLocation() + ": " + msg).str(); 87 if (prevTok.size()) 88 s += "\n>>> " + getLine().str() + "\n>>> " + 89 std::string(getColumnNumber(), ' ') + "^"; 90 ErrAlways(ctx) << s; 91 } 92 93 void ScriptLexer::lex() { 94 for (;;) { 95 StringRef &s = curBuf.s; 96 s = skipSpace(s); 97 if (s.empty()) { 98 // If this buffer is from an INCLUDE command, switch to the "return 99 // value"; otherwise, mark EOF. 100 if (buffers.empty()) { 101 eof = true; 102 return; 103 } 104 activeFilenames.erase(curBuf.filename); 105 curBuf = buffers.pop_back_val(); 106 continue; 107 } 108 curTokState = inExpr; 109 110 // Quoted token. Note that double-quote characters are parts of a token 111 // because, in a glob match context, only unquoted tokens are interpreted 112 // as glob patterns. Double-quoted tokens are literal patterns in that 113 // context. 114 if (s.starts_with("\"")) { 115 size_t e = s.find("\"", 1); 116 if (e == StringRef::npos) { 117 size_t lineno = 118 StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n'); 119 ErrAlways(ctx) << curBuf.filename << ":" << (lineno + 1) 120 << ": unclosed quote"; 121 return; 122 } 123 124 curTok = s.take_front(e + 1); 125 s = s.substr(e + 1); 126 return; 127 } 128 129 // Some operators form separate tokens. 130 if (s.starts_with("<<=") || s.starts_with(">>=")) { 131 curTok = s.substr(0, 3); 132 s = s.substr(3); 133 return; 134 } 135 if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { 136 curTok = s.substr(0, 2); 137 s = s.substr(2); 138 return; 139 } 140 141 // Unquoted token. The non-expression token is more relaxed than tokens in 142 // C-like languages, so that you can write "file-name.cpp" as one bare 143 // token. 
144 size_t pos; 145 if (inExpr) { 146 pos = s.find_first_not_of( 147 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 148 "0123456789_.$"); 149 if (pos == 0 && s.size() >= 2 && 150 ((s[0] == s[1] && strchr("<>&|", s[0])) || 151 is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) 152 pos = 2; 153 } else { 154 pos = s.find_first_not_of( 155 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 156 "0123456789_.$/\\~=+[]*?-!^:"); 157 } 158 159 if (pos == 0) 160 pos = 1; 161 curTok = s.substr(0, pos); 162 s = s.substr(pos); 163 break; 164 } 165 } 166 167 // Skip leading whitespace characters or comments. 168 StringRef ScriptLexer::skipSpace(StringRef s) { 169 for (;;) { 170 if (s.starts_with("/*")) { 171 size_t e = s.find("*/", 2); 172 if (e == StringRef::npos) { 173 setError("unclosed comment in a linker script"); 174 return ""; 175 } 176 curBuf.lineNumber += s.substr(0, e).count('\n'); 177 s = s.substr(e + 2); 178 continue; 179 } 180 if (s.starts_with("#")) { 181 size_t e = s.find('\n', 1); 182 if (e == StringRef::npos) 183 e = s.size() - 1; 184 else 185 ++curBuf.lineNumber; 186 s = s.substr(e + 1); 187 continue; 188 } 189 StringRef saved = s; 190 s = s.ltrim(); 191 auto len = saved.size() - s.size(); 192 if (len == 0) 193 return s; 194 curBuf.lineNumber += saved.substr(0, len).count('\n'); 195 } 196 } 197 198 // Used to determine whether to stop parsing. Treat errors like EOF. 199 bool ScriptLexer::atEOF() { return eof || errCount(ctx); } 200 201 StringRef ScriptLexer::next() { 202 prevTok = peek(); 203 // `prevTokLine` is not updated for EOF so that the line number in `setError` 204 // will be more useful. 205 if (prevTok.size()) 206 prevTokLine = curBuf.lineNumber; 207 return std::exchange(curTok, StringRef(curBuf.s.data(), 0)); 208 } 209 210 StringRef ScriptLexer::peek() { 211 // curTok is invalid if curTokState and inExpr mismatch. 
212 if (curTok.size() && curTokState != inExpr) { 213 curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data()); 214 curTok = {}; 215 } 216 if (curTok.empty()) 217 lex(); 218 return curTok; 219 } 220 221 bool ScriptLexer::consume(StringRef tok) { 222 if (peek() != tok) 223 return false; 224 next(); 225 return true; 226 } 227 228 void ScriptLexer::skip() { (void)next(); } 229 230 void ScriptLexer::expect(StringRef expect) { 231 if (errCount(ctx)) 232 return; 233 StringRef tok = next(); 234 if (tok != expect) { 235 if (atEOF()) 236 setError("unexpected EOF"); 237 else 238 setError(expect + " expected, but got " + tok); 239 } 240 } 241 242 ScriptLexer::Token ScriptLexer::till(StringRef tok) { 243 StringRef str = next(); 244 if (str == tok) 245 return {}; 246 if (!atEOF()) 247 return {str}; 248 prevTok = {}; 249 setError("unexpected EOF"); 250 return {}; 251 } 252 253 // Returns true if S encloses T. 254 static bool encloses(StringRef s, StringRef t) { 255 return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end(); 256 } 257 258 MemoryBufferRef ScriptLexer::getCurrentMB() { 259 // Find input buffer containing the current token. 260 assert(!mbs.empty()); 261 for (MemoryBufferRef mb : mbs) 262 if (encloses(mb.getBuffer(), curBuf.s)) 263 return mb; 264 llvm_unreachable("getCurrentMB: failed to find a token"); 265 } 266