1 //===--- Markup.cpp -----------------------------------------*- C++-*------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "support/Markup.h" 9 #include "llvm/ADT/ArrayRef.h" 10 #include "llvm/ADT/STLExtras.h" 11 #include "llvm/ADT/SmallVector.h" 12 #include "llvm/ADT/StringExtras.h" 13 #include "llvm/ADT/StringRef.h" 14 #include "llvm/Support/Compiler.h" 15 #include "llvm/Support/raw_ostream.h" 16 #include <cstddef> 17 #include <iterator> 18 #include <memory> 19 #include <string> 20 #include <vector> 21 22 namespace clang { 23 namespace clangd { 24 namespace markup { 25 namespace { 26 27 // Is <contents a plausible start to an HTML tag? 28 // Contents may not be the rest of the line, but it's the rest of the plain 29 // text, so we expect to see at least the tag name. 30 bool looksLikeTag(llvm::StringRef Contents) { 31 if (Contents.empty()) 32 return false; 33 if (Contents.front() == '!' || Contents.front() == '?' || 34 Contents.front() == '/') 35 return true; 36 // Check the start of the tag name. 37 if (!llvm::isAlpha(Contents.front())) 38 return false; 39 // Drop rest of the tag name, and following whitespace. 40 Contents = Contents 41 .drop_while([](char C) { 42 return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':'; 43 }) 44 .drop_while(llvm::isSpace); 45 // The rest of the tag consists of attributes, which have restrictive names. 46 // If we hit '=', all bets are off (attribute values can contain anything). 47 for (; !Contents.empty(); Contents = Contents.drop_front()) { 48 if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front())) 49 continue; 50 if (Contents.front() == '>' || Contents.startswith("/>")) 51 return true; // May close the tag. 52 if (Contents.front() == '=') 53 return true; // Don't try to parse attribute values. 54 return false; // Random punctuation means this isn't a tag. 55 } 56 return true; // Potentially incomplete tag. 57 } 58 59 // Tests whether C should be backslash-escaped in markdown. 60 // The string being escaped is Before + C + After. This is part of a paragraph. 61 // StartsLine indicates whether `Before` is the start of the line. 62 // After may not be everything until the end of the line. 63 // 64 // It's always safe to escape punctuation, but want minimal escaping. 65 // The strategy is to escape the first character of anything that might start 66 // a markdown grammar construct. 67 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, 68 bool StartsLine) { 69 assert(Before.take_while(llvm::isSpace).empty()); 70 auto RulerLength = [&]() -> /*Length*/ unsigned { 71 if (!StartsLine || !Before.empty()) 72 return false; 73 llvm::StringRef A = After.rtrim(); 74 return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0; 75 }; 76 auto IsBullet = [&]() { 77 return StartsLine && Before.empty() && 78 (After.empty() || After.startswith(" ")); 79 }; 80 auto SpaceSurrounds = [&]() { 81 return (After.empty() || llvm::isSpace(After.front())) && 82 (Before.empty() || llvm::isSpace(Before.back())); 83 }; 84 auto WordSurrounds = [&]() { 85 return (!After.empty() && llvm::isAlnum(After.front())) && 86 (!Before.empty() && llvm::isAlnum(Before.back())); 87 }; 88 89 switch (C) { 90 case '\\': // Escaped character. 91 return true; 92 case '`': // Code block or inline code 93 // Any number of backticks can delimit an inline code block that can end 94 // anywhere (including on another line). We must escape them all. 95 return true; 96 case '~': // Code block 97 return StartsLine && Before.empty() && After.startswith("~~"); 98 case '#': { // ATX heading. 99 if (!StartsLine || !Before.empty()) 100 return false; 101 llvm::StringRef Rest = After.ltrim(C); 102 return Rest.empty() || Rest.startswith(" "); 103 } 104 case ']': // Link or link reference. 105 // We escape ] rather than [ here, because it's more constrained: 106 // ](...) is an in-line link 107 // ]: is a link reference 108 // The following are only links if the link reference exists: 109 // ] by itself is a shortcut link 110 // ][...] is an out-of-line link 111 // Because we never emit link references, we don't need to handle these. 112 return After.startswith(":") || After.startswith("("); 113 case '=': // Setex heading. 114 return RulerLength() > 0; 115 case '_': // Horizontal ruler or matched delimiter. 116 if (RulerLength() >= 3) 117 return true; 118 // Not a delimiter if surrounded by space, or inside a word. 119 // (The rules at word boundaries are subtle). 120 return !(SpaceSurrounds() || WordSurrounds()); 121 case '-': // Setex heading, horizontal ruler, or bullet. 122 if (RulerLength() > 0) 123 return true; 124 return IsBullet(); 125 case '+': // Bullet list. 126 return IsBullet(); 127 case '*': // Bullet list, horizontal ruler, or delimiter. 128 return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds(); 129 case '<': // HTML tag (or autolink, which we choose not to escape) 130 return looksLikeTag(After); 131 case '>': // Quote marker. Needs escaping at start of line. 132 return StartsLine && Before.empty(); 133 case '&': { // HTML entity reference 134 auto End = After.find(';'); 135 if (End == llvm::StringRef::npos) 136 return false; 137 llvm::StringRef Content = After.substr(0, End); 138 if (Content.consume_front("#")) { 139 if (Content.consume_front("x") || Content.consume_front("X")) 140 return llvm::all_of(Content, llvm::isHexDigit); 141 return llvm::all_of(Content, llvm::isDigit); 142 } 143 return llvm::all_of(Content, llvm::isAlpha); 144 } 145 case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line. 146 case ')': 147 return StartsLine && !Before.empty() && 148 llvm::all_of(Before, llvm::isDigit) && After.startswith(" "); 149 default: 150 return false; 151 } 152 } 153 154 /// Escape a markdown text block. Ensures the punctuation will not introduce 155 /// any of the markdown constructs. 156 std::string renderText(llvm::StringRef Input, bool StartsLine) { 157 std::string R; 158 for (unsigned I = 0; I < Input.size(); ++I) { 159 if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1), 160 StartsLine)) 161 R.push_back('\\'); 162 R.push_back(Input[I]); 163 } 164 return R; 165 } 166 167 /// Renders \p Input as an inline block of code in markdown. The returned value 168 /// is surrounded by backticks and the inner contents are properly escaped. 169 std::string renderInlineBlock(llvm::StringRef Input) { 170 std::string R; 171 // Double all backticks to make sure we don't close the inline block early. 172 for (size_t From = 0; From < Input.size();) { 173 size_t Next = Input.find("`", From); 174 R += Input.substr(From, Next - From); 175 if (Next == llvm::StringRef::npos) 176 break; 177 R += "``"; // double the found backtick. 178 179 From = Next + 1; 180 } 181 // If results starts with a backtick, add spaces on both sides. The spaces 182 // are ignored by markdown renderers. 183 if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`")) 184 return "` " + std::move(R) + " `"; 185 // Markdown render should ignore first and last space if both are there. We 186 // add an extra pair of spaces in that case to make sure we render what the 187 // user intended. 188 if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" ")) 189 return "` " + std::move(R) + " `"; 190 return "`" + std::move(R) + "`"; 191 } 192 193 /// Get marker required for \p Input to represent a markdown codeblock. It 194 /// consists of at least 3 backticks(`). Although markdown also allows to use 195 /// tilde(~) for code blocks, they are never used. 196 std::string getMarkerForCodeBlock(llvm::StringRef Input) { 197 // Count the maximum number of consecutive backticks in \p Input. We need to 198 // start and end the code block with more. 199 unsigned MaxBackticks = 0; 200 unsigned Backticks = 0; 201 for (char C : Input) { 202 if (C == '`') { 203 ++Backticks; 204 continue; 205 } 206 MaxBackticks = std::max(MaxBackticks, Backticks); 207 Backticks = 0; 208 } 209 MaxBackticks = std::max(Backticks, MaxBackticks); 210 // Use the corresponding number of backticks to start and end a code block. 211 return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`'); 212 } 213 214 // Trims the input and concatenates whitespace blocks into a single ` `. 215 std::string canonicalizeSpaces(llvm::StringRef Input) { 216 llvm::SmallVector<llvm::StringRef> Words; 217 llvm::SplitString(Input, Words); 218 return llvm::join(Words, " "); 219 } 220 221 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children, 222 void (Block::*RenderFunc)(llvm::raw_ostream &) const) { 223 std::string R; 224 llvm::raw_string_ostream OS(R); 225 226 // Trim rulers. 227 Children = Children.drop_while( 228 [](const std::unique_ptr<Block> &C) { return C->isRuler(); }); 229 auto Last = llvm::find_if( 230 llvm::reverse(Children), 231 [](const std::unique_ptr<Block> &C) { return !C->isRuler(); }); 232 Children = Children.drop_back(Children.end() - Last.base()); 233 234 bool LastBlockWasRuler = true; 235 for (const auto &C : Children) { 236 if (C->isRuler() && LastBlockWasRuler) 237 continue; 238 LastBlockWasRuler = C->isRuler(); 239 ((*C).*RenderFunc)(OS); 240 } 241 242 // Get rid of redundant empty lines introduced in plaintext while imitating 243 // padding in markdown. 244 std::string AdjustedResult; 245 llvm::StringRef TrimmedText(OS.str()); 246 TrimmedText = TrimmedText.trim(); 247 248 llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult), 249 [&TrimmedText](const char &C) { 250 return !llvm::StringRef(TrimmedText.data(), 251 &C - TrimmedText.data() + 1) 252 // We allow at most two newlines. 253 .endswith("\n\n\n"); 254 }); 255 256 return AdjustedResult; 257 } 258 259 // Separates two blocks with extra spacing. Note that it might render strangely 260 // in vscode if the trailing block is a codeblock, see 261 // https://github.com/microsoft/vscode/issues/88416 for details. 262 class Ruler : public Block { 263 public: 264 void renderMarkdown(llvm::raw_ostream &OS) const override { 265 // Note that we need an extra new line before the ruler, otherwise we might 266 // make previous block a title instead of introducing a ruler. 267 OS << "\n---\n"; 268 } 269 void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; } 270 std::unique_ptr<Block> clone() const override { 271 return std::make_unique<Ruler>(*this); 272 } 273 bool isRuler() const override { return true; } 274 }; 275 276 class CodeBlock : public Block { 277 public: 278 void renderMarkdown(llvm::raw_ostream &OS) const override { 279 std::string Marker = getMarkerForCodeBlock(Contents); 280 // No need to pad from previous blocks, as they should end with a new line. 281 OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n'; 282 } 283 284 void renderPlainText(llvm::raw_ostream &OS) const override { 285 // In plaintext we want one empty line before and after codeblocks. 286 OS << '\n' << Contents << "\n\n"; 287 } 288 289 std::unique_ptr<Block> clone() const override { 290 return std::make_unique<CodeBlock>(*this); 291 } 292 293 CodeBlock(std::string Contents, std::string Language) 294 : Contents(std::move(Contents)), Language(std::move(Language)) {} 295 296 private: 297 std::string Contents; 298 std::string Language; 299 }; 300 301 // Inserts two spaces after each `\n` to indent each line. First line is not 302 // indented. 303 std::string indentLines(llvm::StringRef Input) { 304 assert(!Input.endswith("\n") && "Input should've been trimmed."); 305 std::string IndentedR; 306 // We'll add 2 spaces after each new line. 307 IndentedR.reserve(Input.size() + Input.count('\n') * 2); 308 for (char C : Input) { 309 IndentedR += C; 310 if (C == '\n') 311 IndentedR.append(" "); 312 } 313 return IndentedR; 314 } 315 316 class Heading : public Paragraph { 317 public: 318 Heading(size_t Level) : Level(Level) {} 319 void renderMarkdown(llvm::raw_ostream &OS) const override { 320 OS << std::string(Level, '#') << ' '; 321 Paragraph::renderMarkdown(OS); 322 } 323 324 private: 325 size_t Level; 326 }; 327 328 } // namespace 329 330 std::string Block::asMarkdown() const { 331 std::string R; 332 llvm::raw_string_ostream OS(R); 333 renderMarkdown(OS); 334 return llvm::StringRef(OS.str()).trim().str(); 335 } 336 337 std::string Block::asPlainText() const { 338 std::string R; 339 llvm::raw_string_ostream OS(R); 340 renderPlainText(OS); 341 return llvm::StringRef(OS.str()).trim().str(); 342 } 343 344 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { 345 bool NeedsSpace = false; 346 bool HasChunks = false; 347 for (auto &C : Chunks) { 348 if (C.SpaceBefore || NeedsSpace) 349 OS << " "; 350 switch (C.Kind) { 351 case Chunk::PlainText: 352 OS << renderText(C.Contents, !HasChunks); 353 break; 354 case Chunk::InlineCode: 355 OS << renderInlineBlock(C.Contents); 356 break; 357 } 358 HasChunks = true; 359 NeedsSpace = C.SpaceAfter; 360 } 361 // Paragraphs are translated into markdown lines, not markdown paragraphs. 362 // Therefore it only has a single linebreak afterwards. 363 // VSCode requires two spaces at the end of line to start a new one. 364 OS << " \n"; 365 } 366 367 std::unique_ptr<Block> Paragraph::clone() const { 368 return std::make_unique<Paragraph>(*this); 369 } 370 371 /// Choose a marker to delimit `Text` from a prioritized list of options. 372 /// This is more readable than escaping for plain-text. 373 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, 374 llvm::StringRef Text) { 375 // Prefer a delimiter whose characters don't appear in the text. 376 for (llvm::StringRef S : Options) 377 if (Text.find_first_of(S) == llvm::StringRef::npos) 378 return S; 379 return Options.front(); 380 } 381 382 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { 383 bool NeedsSpace = false; 384 for (auto &C : Chunks) { 385 if (C.SpaceBefore || NeedsSpace) 386 OS << " "; 387 llvm::StringRef Marker = ""; 388 if (C.Preserve && C.Kind == Chunk::InlineCode) 389 Marker = chooseMarker({"`", "'", "\""}, C.Contents); 390 OS << Marker << C.Contents << Marker; 391 NeedsSpace = C.SpaceAfter; 392 } 393 OS << '\n'; 394 } 395 396 BulletList::BulletList() = default; 397 BulletList::~BulletList() = default; 398 399 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const { 400 for (auto &D : Items) { 401 // Instead of doing this we might prefer passing Indent to children to get 402 // rid of the copies, if it turns out to be a bottleneck. 403 OS << "- " << indentLines(D.asMarkdown()) << '\n'; 404 } 405 // We need a new line after list to terminate it in markdown. 406 OS << '\n'; 407 } 408 409 void BulletList::renderPlainText(llvm::raw_ostream &OS) const { 410 for (auto &D : Items) { 411 // Instead of doing this we might prefer passing Indent to children to get 412 // rid of the copies, if it turns out to be a bottleneck. 413 OS << "- " << indentLines(D.asPlainText()) << '\n'; 414 } 415 } 416 417 Paragraph &Paragraph::appendSpace() { 418 if (!Chunks.empty()) 419 Chunks.back().SpaceAfter = true; 420 return *this; 421 } 422 423 Paragraph &Paragraph::appendText(llvm::StringRef Text) { 424 std::string Norm = canonicalizeSpaces(Text); 425 if (Norm.empty()) 426 return *this; 427 Chunks.emplace_back(); 428 Chunk &C = Chunks.back(); 429 C.Contents = std::move(Norm); 430 C.Kind = Chunk::PlainText; 431 C.SpaceBefore = llvm::isSpace(Text.front()); 432 C.SpaceAfter = llvm::isSpace(Text.back()); 433 return *this; 434 } 435 436 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) { 437 bool AdjacentCode = 438 !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode; 439 std::string Norm = canonicalizeSpaces(std::move(Code)); 440 if (Norm.empty()) 441 return *this; 442 Chunks.emplace_back(); 443 Chunk &C = Chunks.back(); 444 C.Contents = std::move(Norm); 445 C.Kind = Chunk::InlineCode; 446 C.Preserve = Preserve; 447 // Disallow adjacent code spans without spaces, markdown can't render them. 448 C.SpaceBefore = AdjacentCode; 449 return *this; 450 } 451 452 std::unique_ptr<Block> BulletList::clone() const { 453 return std::make_unique<BulletList>(*this); 454 } 455 456 class Document &BulletList::addItem() { 457 Items.emplace_back(); 458 return Items.back(); 459 } 460 461 Document &Document::operator=(const Document &Other) { 462 Children.clear(); 463 for (const auto &C : Other.Children) 464 Children.push_back(C->clone()); 465 return *this; 466 } 467 468 void Document::append(Document Other) { 469 std::move(Other.Children.begin(), Other.Children.end(), 470 std::back_inserter(Children)); 471 } 472 473 Paragraph &Document::addParagraph() { 474 Children.push_back(std::make_unique<Paragraph>()); 475 return *static_cast<Paragraph *>(Children.back().get()); 476 } 477 478 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); } 479 480 void Document::addCodeBlock(std::string Code, std::string Language) { 481 Children.emplace_back( 482 std::make_unique<CodeBlock>(std::move(Code), std::move(Language))); 483 } 484 485 std::string Document::asMarkdown() const { 486 return renderBlocks(Children, &Block::renderMarkdown); 487 } 488 489 std::string Document::asPlainText() const { 490 return renderBlocks(Children, &Block::renderPlainText); 491 } 492 493 BulletList &Document::addBulletList() { 494 Children.emplace_back(std::make_unique<BulletList>()); 495 return *static_cast<BulletList *>(Children.back().get()); 496 } 497 498 Paragraph &Document::addHeading(size_t Level) { 499 assert(Level > 0); 500 Children.emplace_back(std::make_unique<Heading>(Level)); 501 return *static_cast<Paragraph *>(Children.back().get()); 502 } 503 } // namespace markup 504 } // namespace clangd 505 } // namespace clang 506