PageRenderTime 40ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/src/ServiceStack/WebHost.EndPoints/Support/Markdown/Markdown.cs

http://github.com/ServiceStack/ServiceStack
C# | 1668 lines | 983 code | 226 blank | 459 comment | 54 complexity | ed694c0cb1b22a50394835a79662bade MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. /*
  2. * MarkdownSharp
  3. * -------------
  4. * a C# Markdown processor
  5. *
  6. * Markdown is a text-to-HTML conversion tool for web writers
  7. * Copyright (c) 2004 John Gruber
  8. * http://daringfireball.net/projects/markdown/
  9. *
  10. * Markdown.NET
  11. * Copyright (c) 2004-2009 Milan Negovan
  12. * http://www.aspnetresources.com
  13. * http://aspnetresources.com/blog/markdown_announced.aspx
  14. *
  15. * MarkdownSharp
  16. * Copyright (c) 2009-2011 Jeff Atwood
  17. * http://stackoverflow.com
  18. * http://www.codinghorror.com/blog/
  19. * http://code.google.com/p/markdownsharp/
  20. *
  21. * History: Milan ported the Markdown processor to C#. He granted license to me so I can open source it
  22. * and let the community contribute to and improve MarkdownSharp.
  23. *
  24. */
  25. #region Copyright and license
  26. /*
  27. Copyright (c) 2009 - 2010 Jeff Atwood
  28. http://www.opensource.org/licenses/mit-license.php
  29. Permission is hereby granted, free of charge, to any person obtaining a copy
  30. of this software and associated documentation files (the "Software"), to deal
  31. in the Software without restriction, including without limitation the rights
  32. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  33. copies of the Software, and to permit persons to whom the Software is
  34. furnished to do so, subject to the following conditions:
  35. The above copyright notice and this permission notice shall be included in
  36. all copies or substantial portions of the Software.
  37. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  38. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  39. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  40. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  41. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  42. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  43. THE SOFTWARE.
  44. Copyright (c) 2003-2004 John Gruber
  45. <http://daringfireball.net/>
  46. All rights reserved.
  47. Redistribution and use in source and binary forms, with or without
  48. modification, are permitted provided that the following conditions are
  49. met:
  50. * Redistributions of source code must retain the above copyright notice,
  51. this list of conditions and the following disclaimer.
  52. * Redistributions in binary form must reproduce the above copyright
  53. notice, this list of conditions and the following disclaimer in the
  54. documentation and/or other materials provided with the distribution.
  55. * Neither the name "Markdown" nor the names of its contributors may
  56. be used to endorse or promote products derived from this software
  57. without specific prior written permission.
  58. This software is provided by the copyright holders and contributors "as
  59. is" and any express or implied warranties, including, but not limited
  60. to, the implied warranties of merchantability and fitness for a
  61. particular purpose are disclaimed. In no event shall the copyright owner
  62. or contributors be liable for any direct, indirect, incidental, special,
  63. exemplary, or consequential damages (including, but not limited to,
  64. procurement of substitute goods or services; loss of use, data, or
  65. profits; or business interruption) however caused and on any theory of
  66. liability, whether in contract, strict liability, or tort (including
  67. negligence or otherwise) arising in any way out of the use of this
  68. software, even if advised of the possibility of such damage.
  69. */
  70. #endregion
  71. using System;
  72. using System.Collections.Generic;
  73. using System.Configuration;
  74. using System.Text;
  75. using System.Text.RegularExpressions;
  76. namespace MarkdownSharp
  77. {
  /// <summary>
  /// Plain set of option values that can be handed to the Markdown(MarkdownOptions) constructor.
  /// Every property is a simple read/write value with no validation.
  /// </summary>
  public class MarkdownOptions
  {
      /// <summary>
      /// When true, (most) bare plain URLs are auto-hyperlinked.
      /// WARNING: this is a significant deviation from the markdown spec.
      /// </summary>
      public bool AutoHyperlink { get; set; }

      /// <summary>
      /// When true, RETURN becomes a literal newline.
      /// WARNING: this is a significant deviation from the markdown spec.
      /// </summary>
      public bool AutoNewlines { get; set; }

      /// <summary>
      /// Suffix for empty elements: use ">" for HTML output, or " />" for XHTML output.
      /// </summary>
      public string EmptyElementSuffix { get; set; }

      /// <summary>
      /// When true, problematic URL characters like [, ], (, and so forth will be encoded.
      /// WARNING: this is a significant deviation from the markdown spec.
      /// </summary>
      public bool EncodeProblemUrlCharacters { get; set; }

      /// <summary>
      /// When false, email addresses will never be auto-linked.
      /// WARNING: this is a significant deviation from the markdown spec.
      /// </summary>
      public bool LinkEmails { get; set; }

      /// <summary>
      /// When true, bold and italic require non-word characters on either side.
      /// WARNING: this is a significant deviation from the markdown spec.
      /// </summary>
      public bool StrictBoldItalic { get; set; }
  }
  110. /// <summary>
  111. /// Markdown is a text-to-HTML conversion tool for web writers.
  112. /// Markdown allows you to write using an easy-to-read, easy-to-write plain text format,
  113. /// then convert it to structurally valid XHTML (or HTML).
  114. /// </summary>
  115. public class Markdown
  116. {
  //Mono's RegEx is very limited and can't support avg-sized Markdown documents
  // When this flag is true, Transform() hands the text to MarkdownDeep instead of
  // running the regex-based pipeline in this class.
  // NOTE(review): "|| true" makes the initial value true on every runtime, not only on
  // Mono; it is false only if a caller assigns the field. Confirm this is intended.
  public static bool UseMarkdownDeep = ServiceStack.Text.Env.IsMono || true;
  // Version string returned by the Version property.
  private const string _version = "1.13";
  120. #region Constructors and Options
  /// <summary>
  /// Creates a Markdown instance with the built-in default options;
  /// equivalent to <c>new Markdown(false)</c>, so no configuration file is consulted.
  /// </summary>
  public Markdown()
      : this(false)
  {
  }
  /// <summary>
  /// Create a new Markdown instance and optionally load options from a configuration
  /// file. There they should be stored in the appSettings section, available options are:
  ///
  ///     Markdown.StrictBoldItalic (true/false)
  ///     Markdown.EmptyElementSuffix (">" or " />" without the quotes)
  ///     Markdown.LinkEmails (true/false)
  ///     Markdown.AutoNewlines (true/false; the spelling Markdown.AutoNewLines is accepted as well)
  ///     Markdown.AutoHyperlink (true/false)
  ///     Markdown.EncodeProblemUrlCharacters (true/false)
  ///
  /// Key names are matched case-sensitively; unrecognised keys are ignored.
  /// </summary>
  /// <param name="loadOptionsFromConfigFile">
  /// when false the constructor returns immediately and the built-in defaults stay in effect
  /// </param>
  public Markdown(bool loadOptionsFromConfigFile)
  {
      if (!loadOptionsFromConfigFile) return;

      var settings = ConfigurationManager.AppSettings;
      foreach (string key in settings.Keys)
      {
          switch (key)
          {
              case "Markdown.AutoHyperlink":
                  _autoHyperlink = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.AutoNewlines":
              // Earlier documentation (and the AutoNewLines property) spell this with a
              // capital L; accept that spelling too so such a setting is not silently dropped.
              case "Markdown.AutoNewLines":
                  _autoNewlines = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.EmptyElementSuffix":
                  _emptyElementSuffix = settings[key];
                  break;
              case "Markdown.EncodeProblemUrlCharacters":
                  _encodeProblemUrlCharacters = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.LinkEmails":
                  _linkEmails = Convert.ToBoolean(settings[key]);
                  break;
              case "Markdown.StrictBoldItalic":
                  _strictBoldItalic = Convert.ToBoolean(settings[key]);
                  break;
          }
      }
  }
  /// <summary>
  /// Create a new Markdown instance and set the options from the MarkdownOptions object.
  /// </summary>
  /// <param name="options">
  /// option values to copy; a null or empty EmptyElementSuffix leaves the built-in
  /// default suffix in place
  /// </param>
  /// <exception cref="ArgumentNullException"><paramref name="options"/> is null</exception>
  public Markdown(MarkdownOptions options)
  {
      if (options == null)
          throw new ArgumentNullException("options");

      _autoHyperlink = options.AutoHyperlink;
      _autoNewlines = options.AutoNewlines;
      // MarkdownOptions leaves EmptyElementSuffix null unless the caller sets it; copying
      // that null would discard the default suffix, so only a real value overrides it.
      if (!String.IsNullOrEmpty(options.EmptyElementSuffix))
          _emptyElementSuffix = options.EmptyElementSuffix;
      _encodeProblemUrlCharacters = options.EncodeProblemUrlCharacters;
      _linkEmails = options.LinkEmails;
      _strictBoldItalic = options.StrictBoldItalic;
  }
  private string _emptyElementSuffix = " />";

  /// <summary>
  /// Suffix for empty elements: ">" for HTML output, or " />" for XHTML output.
  /// The initial value is " />".
  /// </summary>
  public string EmptyElementSuffix
  {
      get
      {
          return _emptyElementSuffix;
      }
      set
      {
          _emptyElementSuffix = value;
      }
  }
  private bool _linkEmails = true;

  /// <summary>
  /// When false, email addresses will never be auto-linked. The initial value is true.
  /// WARNING: this is a significant deviation from the markdown spec.
  /// </summary>
  public bool LinkEmails
  {
      get
      {
          return _linkEmails;
      }
      set
      {
          _linkEmails = value;
      }
  }
  private bool _strictBoldItalic = false;

  /// <summary>
  /// When true, bold and italic require non-word characters on either side.
  /// The initial value is false.
  /// WARNING: this is a significant deviation from the markdown spec.
  /// </summary>
  public bool StrictBoldItalic
  {
      get
      {
          return _strictBoldItalic;
      }
      set
      {
          _strictBoldItalic = value;
      }
  }
  private bool _autoNewlines = false;

  /// <summary>
  /// When true, RETURN becomes a literal newline. The initial value is false.
  /// WARNING: this is a significant deviation from the markdown spec.
  /// </summary>
  public bool AutoNewLines
  {
      get
      {
          return _autoNewlines;
      }
      set
      {
          _autoNewlines = value;
      }
  }
  private bool _autoHyperlink = false;

  /// <summary>
  /// When true, (most) bare plain URLs are auto-hyperlinked. The initial value is false.
  /// WARNING: this is a significant deviation from the markdown spec.
  /// </summary>
  public bool AutoHyperlink
  {
      get
      {
          return _autoHyperlink;
      }
      set
      {
          _autoHyperlink = value;
      }
  }
  private bool _encodeProblemUrlCharacters = false;

  /// <summary>
  /// When true, problematic URL characters like [, ], (, and so forth will be encoded.
  /// The initial value is false.
  /// WARNING: this is a significant deviation from the markdown spec.
  /// </summary>
  public bool EncodeProblemUrlCharacters
  {
      get
      {
          return _encodeProblemUrlCharacters;
      }
      set
      {
          _encodeProblemUrlCharacters = value;
      }
  }
  239. #endregion
  /// <summary>
  /// Kind of fragment produced by TokenizeHTML: a run of text or a tag.
  /// </summary>
  private enum TokenType
  {
      Text,
      Tag
  }
  /// <summary>
  /// One fragment of tokenized input: its kind together with the exact text it covers.
  /// </summary>
  private struct Token
  {
      public TokenType Type;
      public string Value;

      public Token(TokenType type, string value)
      {
          Type = type;
          Value = value;
      }
  }
  /// <summary>
  /// maximum nested depth of [] and () supported by the transform; implementation detail
  /// </summary>
  private static int _nestDepth = ServiceStack.Text.Env.IsMono ? 1 : 6; //hangs on Mono
  /// <summary>
  /// Tabs are automatically converted to spaces as part of the transform
  /// this constant determines how "wide" those tabs become in spaces
  /// </summary>
  private const int _tabWidth = 4;
  // Regex fragment for an unordered-list marker: one of *, + or -
  private const string _markerUL = @"[*+-]";
  // Regex fragment for an ordered-list marker: one or more digits followed by a period
  private const string _markerOL = @"\d+[.]";
  // Filled once by the static constructor:
  //   _escapeTable:          character          -> hash key
  //   _invertedEscapeTable:  hash key           -> character
  //   _backslashEscapeTable: "\" + character    -> hash key
  private static readonly Dictionary<string, string> _escapeTable;
  private static readonly Dictionary<string, string> _invertedEscapeTable;
  private static readonly Dictionary<string, string> _backslashEscapeTable;
  // Per-transform state, emptied by Setup():
  //   _urls / _titles: link-definition URL and title, keyed by lower-cased link id (see LinkEvaluator)
  //   _htmlBlocks:     original HTML block text, keyed by its hash key (see HtmlEvaluator)
  private readonly Dictionary<string, string> _urls = new Dictionary<string, string>();
  private readonly Dictionary<string, string> _titles = new Dictionary<string, string>();
  private readonly Dictionary<string, string> _htmlBlocks = new Dictionary<string, string>();
  // NOTE(review): presumably the current list nesting level; only its reset to 0 in Setup() is visible here.
  private int _listLevel;
  /// <summary>
  /// In the static constructor we initialize what stays the same across all transforms:
  /// the three escape tables and the regex that recognises backslash escapes.
  /// </summary>
  static Markdown()
  {
      const string escapableChars = @"\`*_{}[]()>#+-.!";

      // character <-> hash key, in both directions
      _escapeTable = new Dictionary<string, string>();
      _invertedEscapeTable = new Dictionary<string, string>();
      // backslash + character -> hash key
      _backslashEscapeTable = new Dictionary<string, string>();

      var alternatives = new List<string>(escapableChars.Length);
      for (int i = 0; i < escapableChars.Length; i++)
      {
          string literal = escapableChars[i].ToString();
          string escaped = @"\" + literal;
          string hashKey = GetHashKey(literal, isHtmlBlock: false);

          _escapeTable.Add(literal, hashKey);
          _invertedEscapeTable.Add(hashKey, literal);
          _backslashEscapeTable.Add(escaped, hashKey);
          alternatives.Add(Regex.Escape(escaped));
      }

      // one alternation matching any of the backslash escapes
      _backslashEscapes = new Regex(string.Join("|", alternatives.ToArray()), RegexOptions.Compiled);
  }
  /// <summary>
  /// Current version of MarkdownSharp;
  /// see http://code.google.com/p/markdownsharp/ for the latest code or to contribute.
  /// </summary>
  public string Version
  {
      get
      {
          return _version;
      }
  }
  // MarkdownDeep engine, created on first use by Transform() and then reused by this instance.
  private MarkdownDeep.Markdown _markdownDeep;

  /// <summary>
  /// Transforms the provided Markdown-formatted text to HTML;
  /// see http://en.wikipedia.org/wiki/Markdown
  /// </summary>
  /// <remarks>
  /// Null or empty input yields an empty string. When UseMarkdownDeep is set the work is
  /// delegated to MarkdownDeep and its result is returned unchanged. Otherwise the text
  /// runs through Normalize, HashHTMLBlocks, StripLinkDefinitions, RunBlockGamut and
  /// Unescape, in that order, and a trailing newline is appended.
  /// </remarks>
  public string Transform(string text)
  {
      if (String.IsNullOrEmpty(text))
      {
          return "";
      }

      if (UseMarkdownDeep)
      {
          _markdownDeep = _markdownDeep ?? new MarkdownDeep.Markdown();
          return _markdownDeep.Transform(text);
      }

      Setup();
      string html = Unescape(
          RunBlockGamut(
              StripLinkDefinitions(
                  HashHTMLBlocks(
                      Normalize(text)))));
      Cleanup();

      return html + "\n";
  }
  /// <summary>
  /// Perform transformations that form block-level tags like paragraphs, headers, and list items.
  /// </summary>
  /// <param name="text">text to transform</param>
  /// <param name="unhash">passed through to FormParagraphs</param>
  private string RunBlockGamut(string text, bool unhash = true)
  {
      string blocks = DoHeaders(text);
      blocks = DoHorizontalRules(blocks);
      blocks = DoLists(blocks);
      blocks = DoCodeBlocks(blocks);
      blocks = DoBlockQuotes(blocks);

      // HashHTMLBlocks() already ran once on the raw source (see Transform). This second
      // pass hashes the markup produced just above, so that FormParagraphs does not
      // wrap <p> tags around block-level tags.
      blocks = HashHTMLBlocks(blocks);

      return FormParagraphs(blocks, unhash: unhash);
  }
  /// <summary>
  /// Perform transformations that occur *within* block-level tags like paragraphs, headers, and list items.
  /// </summary>
  private string RunSpanGamut(string text)
  {
      string span = DoCodeSpans(text);
      span = EscapeSpecialCharsWithinTagAttributes(span);
      span = EscapeBackslashes(span);

      // Images go before anchors, because ![foo][f] looks like an anchor.
      span = DoImages(span);
      span = DoAnchors(span);

      // Auto-links go after DoAnchors(), because < and > can be used as
      // delimiters in inline links like [this](<url>).
      span = DoAutoLinks(span);

      span = EncodeAmpsAndAngles(span);
      span = DoItalicsAndBold(span);
      return DoHardBreaks(span);
  }
  private static Regex _newlinesLeadingTrailing = new Regex(@"^\n+|\n+\z", RegexOptions.Compiled);
  private static Regex _newlinesMultiple = new Regex(@"\n{2,}", RegexOptions.Compiled);
  private static Regex _leadingWhitespace = new Regex(@"^[ ]*", RegexOptions.Compiled);
  private static Regex _htmlBlockHash = new Regex("\x1AH\\d+H", RegexOptions.Compiled);
  /// <summary>
  /// splits on two or more newlines, to form "paragraphs";
  /// each paragraph is then unhashed (if it is a hash and unhashing isn't turned off) or wrapped in HTML p tag
  /// </summary>
  /// <param name="text">block-level text; leading and trailing newlines are dropped before splitting</param>
  /// <param name="unhash">when false, paragraphs that start with an HTML-block hash key are left as they are</param>
  /// <returns>the processed paragraphs joined by a blank line</returns>
  private string FormParagraphs(string text, bool unhash = true)
  {
      // split on two or more newlines
      string[] grafs = _newlinesMultiple.Split(_newlinesLeadingTrailing.Replace(text, ""));
      for (int i = 0; i < grafs.Length; i++)
      {
          // Ordinal comparison: the marker starts with the control character \x1A, and a
          // culture-sensitive StartsWith may treat such characters as ignorable, which
          // would make ordinary paragraphs starting with "H" look like hashed blocks.
          if (grafs[i].StartsWith("\x1AH", StringComparison.Ordinal))
          {
              // unhashify HTML blocks
              if (unhash)
              {
                  int sanityCheck = 50; // just for safety, guard against an infinite loop
                  bool keepGoing = true; // as long as replacements were made, keep going
                  while (keepGoing && sanityCheck > 0)
                  {
                      keepGoing = false;
                      grafs[i] = _htmlBlockHash.Replace(grafs[i], match =>
                      {
                          string html;
                          if (!_htmlBlocks.TryGetValue(match.Value, out html))
                          {
                              // Text that merely looks like a hash key (it was never stored
                              // by HtmlEvaluator) is left untouched instead of throwing
                              // KeyNotFoundException.
                              return match.Value;
                          }
                          keepGoing = true;
                          return html;
                      });
                      sanityCheck--;
                  }
                  // If keepGoing is still true here the sanity limit was hit; the original
                  // code reserved this spot for logging a suspected infinite loop.
              }
          }
          else
          {
              // do span level processing inside the block, then wrap result in <p> tags
              grafs[i] = _leadingWhitespace.Replace(RunSpanGamut(grafs[i]), "<p>") + "</p>";
          }
      }
      return string.Join("\n\n", grafs);
  }
  /// <summary>
  /// Resets the per-transform state: list level, link URLs, link titles and hashed HTML blocks.
  /// </summary>
  private void Setup()
  {
      _listLevel = 0;

      // These tables must start empty for every document; otherwise entries left over
      // from another article would conflict when one page is generated from more than
      // one article (e.g. an index page that shows the N most recent articles).
      _htmlBlocks.Clear();
      _titles.Clear();
      _urls.Clear();
  }
  /// <summary>
  /// Releases the per-transform state after a transform; this is the same reset that Setup() performs.
  /// </summary>
  private void Cleanup()
  {
      Setup();
  }
  // Cache for GetNestedBracketsPattern(); null until the pattern is first requested.
  private static string _nestedBracketsPattern;
  /// <summary>
  /// Reusable pattern to match balanced [brackets]. See Friedl's
  /// "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  /// </summary>
  /// <remarks>
  /// The returned text contains layout whitespace and # comments, so a regex that embeds
  /// it has to be compiled with RegexOptions.IgnorePatternWhitespace.
  /// </remarks>
  /// <returns>the pattern text, built once and then served from the cache field</returns>
  private static string GetNestedBracketsPattern()
  {
      // in other words [this] and [this[also]] and [this[also[too]]]
      // up to _nestDepth
      if (_nestedBracketsPattern == null)
          _nestedBracketsPattern =
              RepeatString(@"
              (?>              # Atomic matching
                 [^\[\]]+      # Anything other than brackets
               |
                 \[
                     ", _nestDepth) + RepeatString(
              @" \]
              )*"
              , _nestDepth);
      return _nestedBracketsPattern;
  }
  449. // private static string GetNestedBracketsPattern()
  450. // {
  451. // // in other words [this] and [this[also]] and [this[also[too]]]
  452. // // up to _nestDepth
  453. // if (_nestedBracketsPattern == null)
  454. // _nestedBracketsPattern =
  455. // RepeatString(@"(?>[^\[\]]+|\[", _nestDepth)
  456. // + RepeatString(@" \])*", _nestDepth);
  457. // return _nestedBracketsPattern;
  458. // }
  // Cache for GetNestedParensPattern(); null until the pattern is first requested.
  private static string _nestedParensPattern;
  /// <summary>
  /// Reusable pattern to match balanced (parens). See Friedl's
  /// "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  /// </summary>
  /// <remarks>
  /// The returned text contains layout whitespace and # comments, so a regex that embeds
  /// it has to be compiled with RegexOptions.IgnorePatternWhitespace.
  /// </remarks>
  /// <returns>the pattern text, built once and then served from the cache field</returns>
  private static string GetNestedParensPattern()
  {
      // in other words (this) and (this(also)) and (this(also(too)))
      // up to _nestDepth
      if (_nestedParensPattern == null)
          _nestedParensPattern =
              RepeatString(@"
              (?>              # Atomic matching
                 [^()\s]+      # Anything other than parens or whitespace
               |
                 \(
                     ", _nestDepth) + RepeatString(
              @" \)
              )*"
              , _nestDepth);
      return _nestedParensPattern;
  }
  // Matches one link definition of the form  [id]: url "optional title"  that starts
  // within the first (_tabWidth - 1) columns of a line. Used by StripLinkDefinitions().
  private static Regex _linkDef = new Regex(string.Format(@"
                  ^[ ]{{0,{0}}}\[(.+)\]:  # id = $1
                    [ ]*
                    \n?                   # maybe *one* newline
                    [ ]*
                  <?(\S+?)>?              # url = $2
                    [ ]*
                    \n?                   # maybe one newline
                    [ ]*
                  (?:
                      (?<=\s)             # lookbehind for whitespace
                      [""(]
                      (.+?)               # title = $3
                      ["")]
                      [ ]*
                  )?                      # title is optional
                  (?:\n+|\Z)", _tabWidth - 1), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Removes every link definition from the text; LinkEvaluator records the URL and
  /// title of each one as it is removed.
  /// </summary>
  /// <remarks>
  /// ^[id]: url "optional title"
  /// </remarks>
  private string StripLinkDefinitions(string text)
  {
      MatchEvaluator recordAndRemove = LinkEvaluator;
      return _linkDef.Replace(text, recordAndRemove);
  }
  /// <summary>
  /// Match callback for link definitions: stores the URL (and the title, when one was
  /// given) under the lower-cased link id and replaces the definition with nothing.
  /// </summary>
  private string LinkEvaluator(Match match)
  {
      GroupCollection parts = match.Groups;
      string id = parts[1].Value.ToLowerInvariant();

      _urls[id] = EncodeAmpsAndAngles(parts[2].Value);

      // group 3 (the title) is optional; it has zero length when absent
      Group title = parts[3];
      if (title.Length > 0)
      {
          _titles[id] = title.Value.Replace("\"", "&quot;");
      }

      return "";
  }
  // compiling this monster regex results in worse performance. trust me.
  // Built once from GetBlockPattern(); HashHTMLBlocks() uses it to find block-level HTML.
  private static Regex _blocksHtml = new Regex(GetBlockPattern(), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
  /// <summary>
  /// derived pretty much verbatim from PHP Markdown
  /// </summary>
  /// <remarks>
  /// Assembles the pattern text used to recognise block-level HTML. The pattern is written
  /// with layout whitespace and # comments and with $placeholders ($less_than_tab,
  /// $block_tags_a_re, $block_tags_b_re, $attr, $content, $content2) that are substituted
  /// at the end of this method.
  /// </remarks>
  /// <returns>the pattern text with every placeholder substituted</returns>
  private static string GetBlockPattern()
  {
      // Hashify HTML blocks:
      // We only want to do this for block-level HTML tags, such as headers,
      // lists, and tables. That's because we still want to wrap <p>s around
      // "paragraphs" that are wrapped in non-block-level tags, such as anchors,
      // phrase emphasis, and spans. The list of tags we're looking for is
      // hard-coded:
      //
      // * List "a" is made of tags which can be both inline or block-level.
      // These will be treated block-level when the start tag is alone on
      // its line, otherwise they're not matched here and will be taken as
      // inline later.
      // * List "b" is made of tags which are always block-level;
      //
      string blockTagsA = "ins|del";
      string blockTagsB = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|script|noscript|form|fieldset|iframe|math";
      // Regular expression for the content of a block tag.
      string attr = @"
      (?>                         # optional tag attributes
        \s                        # starts with whitespace
        (?>
          [^>""/]+                # text outside quotes
        |
          /+(?!>)                 # slash not followed by >
        |
          ""[^""]*""              # text inside double quotes (tolerate >)
        |
          '[^']*'                 # text inside single quotes (tolerate >)
        )*
      )?
      ";
      // "content" nests the same fragment _nestDepth times: the first RepeatString opens
      // the levels, ".*?" is the innermost content, the second RepeatString closes them.
      // \2 refers to the tag name captured by the surrounding pattern.
      string content = RepeatString(@"
          (?>
            [^<]+                 # content without tag
          |
            <\2                   # nested opening tag
              " + attr + @"       # attributes
            (?>
                />
            |
                >", _nestDepth) + // end of opening tag
                ".*?" + // last level nested tag content
      RepeatString(@"
                </\2\s*>          # closing nested tag
            )
            |
            <(?!/\2\s*>           # other tags with a different name
            )
          )*", _nestDepth);
      // same fragment for the "a" tag list, whose tag name is captured in $3
      string content2 = content.Replace(@"\2", @"\3");
      // First, look for nested blocks, e.g.:
      // <div>
      // <div>
      // tags for inner block must be indented.
      // </div>
      // </div>
      //
      // The outermost tags must start at the left margin for this to match, and
      // the inner nested divs must be indented.
      // We need to do this before the next, more liberal match, because the next
      // match will start at the first `<div>` and stop at the first `</div>`.
      string pattern = @"
      (?>
            (?>
              (?<=\n)             # Starting after a blank line
              |                   # or
              \A\n?               # the beginning of the doc
            )
            (                     # save in $1
              # Match from `\n<tag>` to `</tag>\n`, handling nested tags
              # in between.
                  [ ]{0,$less_than_tab}
                  <($block_tags_b_re)   # start tag = $2
                  $attr>                # attributes followed by > and \n
                  $content              # content, support nesting
                  </\2>                 # the matching end tag
                  [ ]*                  # trailing spaces
                  (?=\n+|\Z)            # followed by a newline or end of document
            |                     # Special version for tags of group a.
                  [ ]{0,$less_than_tab}
                  <($block_tags_a_re)   # start tag = $3
                  $attr>[ ]*\n          # attributes followed by >
                  $content2             # content, support nesting
                  </\3>                 # the matching end tag
                  [ ]*                  # trailing spaces
                  (?=\n+|\Z)            # followed by a newline or end of document
            |                     # Special case just for <hr />. It was easier to make a special
                                  # case than to make the other regex more complicated.
                  [ ]{0,$less_than_tab}
                  <(hr)                 # start tag = $2
                  $attr                 # attributes
                  /?>                   # the matching end tag
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            |                     # Special case for standalone HTML comments:
                  [ ]{0,$less_than_tab}
                  (?s:
                    <!-- .*? -->
                  )
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            |                     # PHP and ASP-style processor instructions (<? and <%)
                  [ ]{0,$less_than_tab}
                  (?s:
                    <([?%])             # $2
                    .*?
                    \2>
                  )
                  [ ]*
                  (?=\n{2,}|\Z)         # followed by a blank line or end of document
            )
      )";
      pattern = pattern.Replace("$less_than_tab", (_tabWidth - 1).ToString());
      pattern = pattern.Replace("$block_tags_b_re", blockTagsB);
      pattern = pattern.Replace("$block_tags_a_re", blockTagsA);
      pattern = pattern.Replace("$attr", attr);
      // "$content2" must be substituted before "$content": the shorter name is a prefix
      // of the longer one and would otherwise clobber it.
      pattern = pattern.Replace("$content2", content2);
      pattern = pattern.Replace("$content", content);
      return pattern;
  }
  /// <summary>
  /// Replaces every block-level HTML block in the text with a hash entry; HtmlEvaluator
  /// stores the original block so it can be restored later.
  /// </summary>
  private string HashHTMLBlocks(string text)
  {
      MatchEvaluator hashBlock = HtmlEvaluator;
      return _blocksHtml.Replace(text, hashBlock);
  }
  /// <summary>
  /// Match callback for HTML blocks: remembers the block (group 1) under its hash key and
  /// returns that key surrounded by blank lines.
  /// </summary>
  private string HtmlEvaluator(Match match)
  {
      string block = match.Groups[1].Value;
      string placeholder = GetHashKey(block, isHtmlBlock: true);

      _htmlBlocks[placeholder] = block;

      return "\n\n" + placeholder + "\n\n";
  }
  /// <summary>
  /// Builds the placeholder key for a piece of text: the control character \x1A, a
  /// delimiter letter, the absolute value of the string's hash code, and the delimiter again.
  /// </summary>
  /// <param name="s">text to derive the key from</param>
  /// <param name="isHtmlBlock">true uses the delimiter 'H' (HTML block), false uses 'E' (escaped character)</param>
  /// <returns>a key of the form \x1A + delimiter + digits + delimiter</returns>
  private static string GetHashKey(string s, bool isHtmlBlock)
  {
      var delim = isHtmlBlock ? 'H' : 'E';
      // Widen to long before taking the absolute value: Math.Abs(int) throws
      // OverflowException when GetHashCode() happens to return int.MinValue.
      // Every other hash code yields exactly the same digits as before.
      long magnitude = Math.Abs((long)s.GetHashCode());
      return "\x1A" + delim + magnitude.ToString() + delim;
  }
  // Matches one HTML "tag" token for TokenizeHTML(): a comment, a <? ... ?> processing
  // instruction, or a tag, where the tag alternative is nested _nestDepth levels deep.
  // The pattern is assembled from pieces; the text after each # is a regex comment
  // (IgnorePatternWhitespace is set).
  private static Regex _htmlTokens = new Regex(@"
      (<!(?:--.*?--\s*)+>)|        # match <!-- foo -->
      (<\?.*?\?>)|                 # match <?foo?> " +
      RepeatString(@"
      (<[A-Za-z\/!$](?:[^<>]|", _nestDepth) + RepeatString(@")*>)", _nestDepth) +
      " # match <tag> and </tag>",
      RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Splits the input into an ordered list of tokens. Each token is either a tag (as
  /// matched by _htmlTokens, possibly with nested tags inside it) or the run of text
  /// between two tags; concatenating the token values gives back the input.
  /// </summary>
  private List<Token> TokenizeHTML(string text)
  {
      // the tag regex is derived from the _tokenize() subroutine in Brad Choate's MTRegex plugin.
      // http://www.bradchoate.com/past/mtregex.php
      var result = new List<Token>();
      int cursor = 0;

      for (Match tag = _htmlTokens.Match(text); tag.Success; tag = tag.NextMatch())
      {
          int gap = tag.Index - cursor;
          if (gap > 0)
          {
              // text between the previous tag (or the start) and this tag
              result.Add(new Token(TokenType.Text, text.Substring(cursor, gap)));
          }

          result.Add(new Token(TokenType.Tag, tag.Value));
          cursor = tag.Index + tag.Length;
      }

      if (cursor < text.Length)
      {
          // trailing text after the last tag
          result.Add(new Token(TokenType.Text, text.Substring(cursor)));
      }

      return result;
  }
  694. // private static Regex _anchorRef = new Regex(string.Format(@"
  695. // ( # wrap whole match in $1
  696. // \[
  697. // ({0}) # link text = $2
  698. // \]
  699. //
  700. // [ ]? # one optional space
  701. // (?:\n[ ]*)? # one optional newline followed by spaces
  702. //
  703. // \[
  704. // (.*?) # id = $3
  705. // \]
  706. // )", GetNestedBracketsPattern()), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Reference-style link: [link text] [id]   ($1 = whole match, $2 = link text, $3 = id).
  // RegexOptions.IgnorePatternWhitespace is required here: GetNestedBracketsPattern()
  // returns a pattern laid out with whitespace and # comments, and without the flag that
  // layout would be matched literally, so ordinary [text][id] links would not match.
  // The compact wrapper itself only uses whitespace inside character classes ([ ]) and
  // as the escape \n, both of which the flag leaves untouched.
  private static Regex _anchorRef = new Regex(string.Format(
      @"(\[({0})\][ ]?(?:\n[ ]*)?\[(.*?)\])",
      GetNestedBracketsPattern()),
      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Inline-style link: [link text](url "optional title"). The link text and href parts
  // reuse the balanced-bracket and balanced-paren patterns. Used by DoAnchors().
  private static Regex _anchorInline = new Regex(string.Format(@"
          (                           # wrap whole match in $1
              \[
                  ({0})               # link text = $2
              \]
              \(                      # literal paren
                  [ ]*
                  ({1})               # href = $3
                  [ ]*
                  (                   # $4
                  (['""])           # quote char = $5
                  (.*?)               # title = $6
                  \5                  # matching quote
                  [ ]*                # ignore any spaces between closing quote and )
                  )?                  # title is optional
              \)
          )", GetNestedBracketsPattern(), GetNestedParensPattern()),
      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Shortcut reference link: [link text] on its own, where the text doubles as the id.
  // Used last by DoAnchors().
  private static Regex _anchorRefShortcut = new Regex(@"
          (                           # wrap whole match in $1
            \[
               ([^\[\]]+)             # link text = $2; can't contain [ or ]
            \]
          )", RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Turn Markdown link shortcuts into HTML anchor tags
  /// </summary>
  /// <remarks>
  /// Handles, in this order:
  /// [link text][id]
  /// [link text](url "title")
  /// [id]
  /// </remarks>
  private string DoAnchors(string text)
  {
      // reference-style links first: [link text] [id]
      string linked = _anchorRef.Replace(text, AnchorRefEvaluator);

      // then inline-style links: [link text](url "optional title")
      linked = _anchorInline.Replace(linked, AnchorInlineEvaluator);

      // reference-style shortcuts ([link text]) go last, in case the text
      // also contains [link test][1] or [link test](/foo)
      return _anchorRefShortcut.Replace(linked, AnchorRefShortcutEvaluator);
  }
  /// <summary>
  /// Match callback for reference-style links ([link text][id]). When the id has a
  /// recorded URL the match becomes an anchor tag (with a title attribute if one was
  /// recorded); otherwise the matched text is returned unchanged.
  /// </summary>
  private string AnchorRefEvaluator(Match match)
  {
      string text = match.Groups[2].Value;
      string id = match.Groups[3].Value.ToLowerInvariant();

      // for shortcut links like [this][]. the link text is the id
      if (id == "")
      {
          id = text.ToLowerInvariant();
      }

      string url;
      if (!_urls.TryGetValue(id, out url))
      {
          return match.Groups[1].Value;
      }

      var anchor = new StringBuilder("<a href=\"");
      anchor.Append(EscapeBoldItalic(EncodeProblemUrlChars(url))).Append("\"");

      string title;
      if (_titles.TryGetValue(id, out title))
      {
          anchor.Append(" title=\"").Append(EscapeBoldItalic(title)).Append("\"");
      }

      anchor.Append(">").Append(text).Append("</a>");
      return anchor.ToString();
  }
  /// <summary>
  /// Match callback for shortcut reference links ([link text]). The link text, lower-cased
  /// and with each newline (plus surrounding spaces) collapsed to one space, is the id.
  /// When that id has a recorded URL the match becomes an anchor tag; otherwise the
  /// matched text is returned unchanged.
  /// </summary>
  private string AnchorRefShortcutEvaluator(Match match)
  {
      string text = match.Groups[2].Value;
      // lower case and remove newlines / extra spaces
      string id = Regex.Replace(text.ToLowerInvariant(), @"[ ]*\n[ ]*", " ");

      string url;
      if (!_urls.TryGetValue(id, out url))
      {
          return match.Groups[1].Value;
      }
      url = EscapeBoldItalic(EncodeProblemUrlChars(url));

      string title;
      string titleAttribute = "";
      if (_titles.TryGetValue(id, out title))
      {
          titleAttribute = " title=\"" + EscapeBoldItalic(title) + "\"";
      }

      return string.Concat("<a href=\"", url, "\"", titleAttribute, ">", text, "</a>");
  }
  /// <summary>
  /// Builds the anchor tag for one inline link match.
  /// </summary>
  /// <param name="match">
  /// Match whose group 2 is the link text, group 3 the url and group 6 the
  /// title (empty when the link has none).
  /// </param>
  /// <returns>The anchor markup for the link.</returns>
  private string AnchorInlineEvaluator(Match match)
  {
      string linkText = match.Groups[2].Value;
      string url = match.Groups[3].Value;
      string title = match.Groups[6].Value;

      // Remove a surrounding <...> pair before any encoding runs, so the check
      // always sees the raw url; this is the same order ImageInlineEvaluator uses.
      // The brackets are syntax rather than linguistic text, hence Ordinal.
      if (url.StartsWith("<", StringComparison.Ordinal) && url.EndsWith(">", StringComparison.Ordinal))
      {
          url = url.Substring(1, url.Length - 2);
      }

      url = EncodeProblemUrlChars(url);
      url = EscapeBoldItalic(url);

      string result = string.Format("<a href=\"{0}\"", url);

      if (!String.IsNullOrEmpty(title))
      {
          // Double quotes would terminate the attribute value, so they become entities.
          title = title.Replace("\"", "&quot;");
          title = EscapeBoldItalic(title);
          result += string.Format(" title=\"{0}\"", title);
      }

      result += string.Format(">{0}</a>", linkText);
      return result;
  }
  // Pattern for reference-style images: "![alt text]", then at most one space and
  // at most one newline (with any spaces after it), then "[id]".
  // ImageReferenceEvaluator reads group 1 (whole match), group 2 (alt text) and
  // group 3 (id). The "#" remarks inside the literal are pattern comments, which
  // RegexOptions.IgnorePatternWhitespace allows; RegexOptions.Singleline lets the
  // lazy ".*?" groups run across line breaks.
  825. private static Regex _imagesRef = new Regex(@"
  826. ( # wrap whole match in $1
  827. !\[
  828. (.*?) # alt text = $2
  829. \]
  830. [ ]? # one optional space
  831. (?:\n[ ]*)? # one optional newline followed by spaces
  832. \[
  833. (.*?) # id = $3
  834. \]
  835. )", RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  // Pattern for inline images: "![alt text]", one optional whitespace character,
  // then "(" url, an optional quoted title and ")".
  // The url sub-pattern ({0}) is filled in by String.Format with the value
  // returned from GetNestedParensPattern().
  // ImageInlineEvaluator reads group 2 (alt text), group 3 (url) and group 6
  // (title); group 5 captures the opening quote so that \5 can require the
  // same character to close the title.
  836. private static Regex _imagesInline = new Regex(String.Format(@"
  837. ( # wrap whole match in $1
  838. !\[
  839. (.*?) # alt text = $2
  840. \]
  841. \s? # one optional whitespace character
  842. \( # literal paren
  843. [ ]*
  844. ({0}) # href = $3
  845. [ ]*
  846. ( # $4
  847. (['""]) # quote char = $5
  848. (.*?) # title = $6
  849. \5 # matching quote
  850. [ ]*
  851. )? # title is optional
  852. \)
  853. )", GetNestedParensPattern()),
  854. RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown image syntax into HTML img tags.
  /// </summary>
  /// <remarks>
  /// Recognised forms:
  /// ![alt text][id]
  /// ![alt text](url "optional title")
  /// </remarks>
  private string DoImages(string text)
  {
      // Reference-style images, ![alt text][id], are replaced first.
      string afterReferenceImages = _imagesRef.Replace(text, ImageReferenceEvaluator);

      // Inline images, ![alt text](url "optional title"), follow. The evaluator
      // passes the url and title through EscapeBoldItalic to deal with * and _.
      return _imagesInline.Replace(afterReferenceImages, ImageInlineEvaluator);
  }
  /// <summary>
  /// Builds the img tag for one reference-style image match.
  /// </summary>
  /// <param name="match">
  /// Match whose group 1 is the whole image reference, group 2 the alt text
  /// and group 3 the image id (empty for the <c>![alt][]</c> form).
  /// </param>
  /// <returns>
  /// The img markup when the id has an entry in <c>_urls</c>; otherwise the
  /// text of group 1, unchanged.
  /// </returns>
  private string ImageReferenceEvaluator(Match match)
  {
      string wholeMatch = match.Groups[1].Value;
      string altText = match.Groups[2].Value;
      string linkID = match.Groups[3].Value.ToLowerInvariant();

      // An empty id, as in ![this][], means the alt text doubles as the id.
      // The id is derived from the alt text before its quotes are escaped.
      if (linkID.Length == 0)
      {
          linkID = altText.ToLowerInvariant();
      }

      // One TryGetValue instead of ContainsKey followed by the indexer,
      // which would look the key up twice.
      string url;
      if (!_urls.TryGetValue(linkID, out url))
      {
          // If there's no such link ID, leave the source text intact.
          return wholeMatch;
      }

      // Double quotes would terminate the alt attribute, so they become entities.
      altText = altText.Replace("\"", "&quot;");

      url = EncodeProblemUrlChars(url);
      url = EscapeBoldItalic(url);
      string result = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, altText);

      // The title attribute is only emitted when one is stored for this id.
      string title;
      if (_titles.TryGetValue(linkID, out title))
      {
          result += string.Format(" title=\"{0}\"", EscapeBoldItalic(title));
      }

      result += _emptyElementSuffix;
      return result;
  }
  /// <summary>
  /// Builds the img tag for one inline image match.
  /// </summary>
  /// <param name="match">
  /// Match whose group 2 is the alt text, group 3 the url and group 6 the
  /// title (empty when the image has none).
  /// </param>
  /// <returns>The img markup for the image.</returns>
  private string ImageInlineEvaluator(Match match)
  {
      string alt = match.Groups[2].Value;
      string url = match.Groups[3].Value;
      string title = match.Groups[6].Value;

      // Double quotes would terminate the attribute values, so they become entities.
      alt = alt.Replace("\"", "&quot;");
      title = title.Replace("\"", "&quot;");

      // Remove <>'s surrounding the url, if present. The brackets are syntax
      // rather than linguistic text, so they are compared ordinally instead of
      // with the culture-sensitive default.
      if (url.StartsWith("<", StringComparison.Ordinal) && url.EndsWith(">", StringComparison.Ordinal))
      {
          url = url.Substring(1, url.Length - 2);
      }

      url = EncodeProblemUrlChars(url);
      url = EscapeBoldItalic(url);

      string result = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, alt);

      if (!String.IsNullOrEmpty(title))
      {
          title = EscapeBoldItalic(title);
          result += string.Format(" title=\"{0}\"", title);
      }

      result += _emptyElementSuffix;
      return result;
  }
  // Pattern for setext headers: a line of text followed by a line made up only
  // of '=' or only of '-' characters, then one or more newlines.
  // Group 1 is the header text and group 2 is the underline run; that is how
  // SetextHeaderEvaluator reads them. NOTE: the inline "$1" remark next to the
  // underline group is off by one, since that group is the second capture.
  // RegexOptions.Multiline makes '^' match at the start of every line.
  923. private static Regex _headerSetext = new Regex(@"
  924. ^(.+?)
  925. [ ]*
  926. \n
  927. (=+|-+) # $1 = string of ='s or -'s
  928. [ ]*
  929. \n+",
  930. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // Pattern for atx headers: one to six '#' at the start of a line, the header
  // text, optional closing '#' characters, then one or more newlines.
  // AtxHeaderEvaluator takes the header level from the length of group 1 and the
  // header text from group 2.
  // RegexOptions.Multiline makes '^' match at the start of every line.
  931. private static Regex _headerAtx = new Regex(@"
  932. ^(\#{1,6}) # $1 = string of #'s
  933. [ ]*
  934. (.+?) # $2 = Header text
  935. [ ]*
  936. \#* # optional closing #'s (not counted)
  937. \n+",
  938. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown headers, both setext and atx style, into HTML header tags.
  /// </summary>
  /// <remarks>
  /// Setext style:
  ///
  /// Header 1
  /// ========
  ///
  /// Header 2
  /// --------
  ///
  /// Atx style:
  ///
  /// # Header 1
  /// ## Header 2
  /// ## Header 2 with closing hashes ##
  /// ...
  /// ###### Header 6
  /// </remarks>
  private string DoHeaders(string text)
  {
      // Underlined (setext) headers are replaced first, then '#'-prefixed (atx) ones.
      string afterSetext = _headerSetext.Replace(text, SetextHeaderEvaluator);
      return _headerAtx.Replace(afterSetext, AtxHeaderEvaluator);
  }
  /// <summary>
  /// Renders one setext header match as an h1 or h2 element followed by a blank line.
  /// </summary>
  /// <param name="match">
  /// Match whose group 1 is the header text and group 2 the underline run.
  /// </param>
  /// <returns>The header element, with its text processed by <c>RunSpanGamut</c>.</returns>
  private string SetextHeaderEvaluator(Match match)
  {
      string headerText = match.Groups[1].Value;
      string underline = match.Groups[2].Value;

      // An underline of '=' gives a level 1 header; anything else gives level 2.
      int level;
      if (underline.StartsWith("="))
      {
          level = 1;
      }
      else
      {
          level = 2;
      }

      string inner = RunSpanGamut(headerText);
      return string.Format("<h{1}>{0}</h{1}>\n\n", inner, level);
  }
  /// <summary>
  /// Renders one atx header match as an h1..h6 element followed by a blank line.
  /// </summary>
  /// <param name="match">
  /// Match whose group 1 is the run of leading '#' characters and group 2 the header text.
  /// </param>
  /// <returns>The header element, with its text processed by <c>RunSpanGamut</c>.</returns>
  private string AtxHeaderEvaluator(Match match)
  {
      string headerText = match.Groups[2].Value;

      // The number of leading '#' characters is the header level.
      int level = match.Groups[1].Length;

      string inner = RunSpanGamut(headerText);
      return string.Format("<h{1}>{0}</h{1}>\n\n", inner, level);
  }
  // Pattern for a horizontal-rule line: up to three leading spaces, a marker
  // character ('-', '*' or '_'), then at least two further occurrences of the
  // same marker (the \1 backreference), each preceded by at most two spaces,
  // and nothing but spaces up to the end of the line.
  // RegexOptions.Multiline makes '^' and '$' match at the start and end of each
  // line; "(?>" opens an atomic (non-backtracking) group.
  // DoHorizontalRules replaces every match with an hr tag.
  973. private static Regex _horizontalRules = new Regex(@"
  974. ^[ ]{0,3} # Leading space
  975. ([-*_]) # $1: First marker
  976. (?> # Repeated marker group
  977. [ ]{0,2} # Zero, one, or two spaces.
  978. \1 # Marker character
  979. ){2,} # Group repeated at least twice
  980. [ ]* # Trailing spaces
  981. $ # End of line.
  982. ", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown horizontal-rule lines into HTML hr tags.
  /// </summary>
  /// <remarks>
  /// Examples of lines that are recognised:
  /// ***
  /// * * *
  /// ---
  /// - - -
  /// </remarks>
  private string DoHorizontalRules(string text)
  {
      // Every rule line becomes the same tag, closed with the configured
      // empty-element suffix and followed by a newline.
      string hrTag = "<hr" + _emptyElementSuffix + "\n";
      return _horizontalRules.Replace(text, hrTag);
  }
  // Pattern text (not yet a Regex) that matches one whole list; _listNested and
  // _listTopLevel each prepend their own anchor to it.
  // It is a string.Format template:
  //   {0} = "(?:" + _markerUL + "|" + _markerOL + ")", i.e. either kind of list marker
  //   {1} = _tabWidth - 1, the largest indent allowed before the first marker
  // Doubled braces ({{ and }}) are string.Format escapes for literal regex braces.
  // ListEvaluator reads group 1 (the whole list) and group 3 (the first item marker).
  // A list ends at the end of input, or at a run of two or more newlines followed
  // by a non-whitespace character that does not begin another list item.
  996. private static string _wholeList = string.Format(@"
  997. ( # $1 = whole list
  998. ( # $2
  999. [ ]{{0,{1}}}
  1000. ({0}) # $3 = first list item marker
  1001. [ ]+
  1002. )
  1003. (?s:.+?)
  1004. ( # $4
  1005. \z
  1006. |
  1007. \n{{2,}}
  1008. (?=\S)
  1009. (?! # Negative lookahead for another list item marker
  1010. [ ]*
  1011. {0}[ ]+
  1012. )
  1013. )
  1014. )", string.Format("(?:{0}|{1})", _markerUL, _markerOL), _tabWidth - 1);
  // List pattern used by DoLists while _listLevel is greater than zero: with
  // RegexOptions.Multiline the leading '^' lets a list begin at the start of any line.
  1015. private static Regex _listNested = new Regex(@"^" + _wholeList,
  1016. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  // List pattern used by DoLists otherwise: the list must begin directly after a
  // blank line (two consecutive newlines) or at the very start of the input,
  // optionally after a single newline.
  1017. private static Regex _listTopLevel = new Regex(@"(?:(?<=\n\n)|\A\n?)" + _wholeList,
  1018. RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);
  /// <summary>
  /// Converts Markdown lists into HTML ul, ol and li tags.
  /// </summary>
  private string DoLists(string text)
  {
      // Nested lists are anchored differently from top-level lists; the long
      // comment inside ProcessListItems explains why _listLevel matters here.
      Regex listPattern = _listLevel > 0 ? _listNested : _listTopLevel;
      return listPattern.Replace(text, ListEvaluator);
  }
  /// <summary>
  /// Renders one whole-list match as a ul or ol element.
  /// </summary>
  /// <param name="match">
  /// Match whose group 1 is the whole list and group 3 the first list item marker.
  /// </param>
  /// <returns>The list element wrapping the items produced by <c>ProcessListItems</c>.</returns>
  private string ListEvaluator(Match match)
  {
      string listSource = match.Groups[1].Value;

      // The first item's marker decides both the tag and the marker pattern
      // used to split the list into items.
      bool isUnordered = Regex.IsMatch(match.Groups[3].Value, _markerUL);
      string listType = isUnordered ? "ul" : "ol";
      string itemMarker = isUnordered ? _markerUL : _markerOL;

      string items = ProcessListItems(listSource, itemMarker);
      return string.Format("<{0}>\n{1}</{0}>\n", listType, items);
  }
  1041. /// <summary>
  1042. /// Process the contents of a single ordered or unordered list, splitting it
  1043. /// into individual list items.
  1044. /// </summary>
  1045. private string ProcessListItems(string list, string marker)
  1046. {
  1047. // The listLevel global keeps track of when we're inside a list.
  1048. // Each time we enter a list, we increment it; when we leave a list,
  1049. // we decrement. If it's zero, we're not in a list anymore.
  1050. // We do this because when we're not inside a list, we want to treat
  1051. // something like this:
  1052. // I recommend upgrading to version
  1053. // 8. Oops, now this line is treated
  1054. // as a sub-list.
  1055. // As a single paragraph, despite the fact that the second line starts
  1056. // with a digit-period-space sequence.
  1057. // Whereas when we're inside a list (or sub-list), that line will be
  1058. // treated as the start of a sub-list. What a kludge, huh? This is
  1059. // an aspect of Markdown's syntax that's hard to parse perfectly
  1060. // without resorting to mind-reading. Perhaps the solution is to
  1061. // change the syntax rules such that sub-lists must start with a
  1062. // starting cardinal number; e.g. "1." or "a.".
  1063. _listLevel++;
  1064. // Trim trailing blank lines:
  1065. list = Regex.Replace(list, @"\n{2,}\z", "\n");
  1066. string pattern = string.Format(
  1067. @"(^[ ]*) # leading whitespace = $1
  1068. ({0}) [ ]+ # list marker = $2
  1069. ((?s:.+?) # list item text = $3
  1070. (\n+))
  1071. (?= (\z | \1 ({0}) [ ]+))", marker);
  1072. bool lastItemHadADoubleNewline = false;
  1073. // has to be a closure, so subsequent invocations can share the bool
  1074. MatchEvaluator ListItemEvaluator = (Match match) =>
  1075. {
  1076. string item = match.Groups[3].Value;
  1077. bool endsWithDoubleNewline = item.EndsWith("\n\n");
  1078. bool containsDoubleNewline = endsWithDoubleNewline || item.Contains("\n\n");
  1079. if (containsDoubleNewline || lastItemHadADoubleNewline)
  1080. // we could correct any bad indentation here..
  1081. item = RunBlockGamut(Outdent(item) + "\n", unhash: false);
  1082. else

Large files files are truncated, but you can click here to view the full file