PageRenderTime 59ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/src/markdown/markdown.erl

https://code.google.com/p/zotonic/
Erlang | 1285 lines | 905 code | 140 blank | 240 comment | 11 complexity | 0e866a9d70f6b396392968721c4397fc MD5 | raw file
Possible License(s): Apache-2.0, MIT, LGPL-2.1, BSD-3-Clause
  1. %%%-------------------------------------------------------------------
  2. %%% @author Gordon Guthrie
  3. %%% @copyright (C) 2009, Gordon Guthrie
  4. %%% @doc,
  5. %%%
  6. %%% @end
  7. %%% Created : 10 Sep 2009 by gordonguthrie@backawinner.gg
  8. %%%-------------------------------------------------------------------
  9. -module(markdown).
  10. -export([conv/1,
  11. conv_utf8/1,
  12. conv_file/2]).
  13. -import(lists, [flatten/1, reverse/1]).
  14. -include_lib("eunit/include/eunit.hrl").
  15. -define(SPACE, 32).
  16. -define(TAB, 9).
  17. -define(LF, 10).
  18. -define(CR, 13).
  19. -define(NBSP, 160).
  20. -define(AMP, $&, $a, $m, $p, $;).
  21. -define(COPY, $&, $c, $o, $p, $y, $;).
  22. %%% the lexer first lexes the input
  23. %%% make_lines does 2 passes:
  24. %%% * it chops the lexed strings into lines which it represents as a
  25. %%% list of lists
  26. %%% * it then types the lines into the following:
  27. %%% * normal lines
  28. %%% * reference style links
  29. %%% * reference style images
  30. %%% * special line types
  31. %%% - blank
  32. %%% - SETEXT header lines
  33. %%% - ATX header lines
  34. %%% - blockquote
  35. %%% - unordered lists
  36. %%% - ordered lists
  37. %%% - code blocks
  38. %%% - horizontal rules
  39. %%% the parser then does its magic interpolating the references as appropriate
  %% Convert a Markdown string into HTML.
  %% Pipeline: lex the input, split the tokens into lines, give every line a
  %% type (collecting the reference definitions on the way) and finally parse
  %% the typed lines against those references.
  conv(String) ->
      Tokens = lex(String),
      {Typed, References} = type_lines(make_lines(Tokens)),
      parse(Typed, References).
  %% Same as conv/1 for UTF-8 encoded input: the bytes are decoded with
  %% xmerl_ucs:from_utf8/1 before conversion and the resulting HTML is
  %% encoded again with xmerl_ucs:to_utf8/1.
  -spec conv_utf8(list()) -> list().
  conv_utf8(Utf8) ->
      Decoded = xmerl_ucs:from_utf8(Utf8),
      xmerl_ucs:to_utf8(conv(Decoded)).
  %% Convert the Markdown file FileIn and write the HTML to FileOut.
  %% Returns the atom 'error' when FileIn cannot be opened; otherwise returns
  %% whatever write/2 returns.
  conv_file(FileIn, FileOut) ->
      case file:open(FileIn, [read]) of
          {ok, Device} ->
              Markdown = get_all_lines(Device, []),
              write(FileOut, conv(Markdown));
          _ ->
              error
      end.
  %% Read every remaining line from Device, close the device and return the
  %% text appended to Accum.
  %% The lines are collected newest-first and joined once at end of file, so
  %% the cost is linear in the input size (appending to the accumulator on
  %% every line was quadratic).
  %% A read error closes the device and raises error({read_error, Reason})
  %% instead of appending the error tuple to the text.
  get_all_lines(Device, Accum) ->
      Accum ++ lists:append(get_all_lines_rev(Device, [])).

  %% Collects the lines of Device newest-first; returns them in file order.
  get_all_lines_rev(Device, Lines) ->
      case io:get_line(Device, "") of
          eof ->
              file:close(Device),
              lists:reverse(Lines);
          {error, Reason} ->
              file:close(Device),
              error({read_error, Reason});
          Line ->
              get_all_lines_rev(Device, [Line | Lines])
      end.
  %% Write Text followed by a newline to File, creating the parent
  %% directory first when needed. Returns the result of file:close/1, or
  %% the atom 'error' when the file cannot be opened for writing.
  write(File, Text) ->
      filelib:ensure_dir(File),
      case file:open(File, [write]) of
          {ok, Handle} ->
              io:fwrite(Handle, "~s~n", [Text]),
              file:close(Handle);
          _ ->
              error
      end.
  76. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  77. %%%
  78. %%% Parse the lines interpolating the references as appropriate
  79. %%%
  80. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %% Render the typed lines to HTML (indent level 0) and strip the newlines
  %% from both ends of the result.
  parse(TypedLines, Refs) ->
      Html = p1(TypedLines, Refs, 0, []),
      string:strip(Html, both, $\n).
  %% p1(TypedLines, R, I, Acc) walks the typed lines and emits HTML for them.
  %%   TypedLines - the output of type_lines/1 (a list of typed line tuples)
  %%   R          - the reference definitions, handed on to make_str/make_tag_str
  %%   I          - the indent level, turned into leading padding by pad/1
  %%   Acc        - the output fragments collected so far, newest first
  %% The clause order matters: the look-behind clauses (a line followed by a
  %% setext underline, two consecutive lines that merge, ...) have to come
  %% before the single-line clause for the same type.
  83. %% goes through the lines
  84. %% Variable 'R' contains the References and 'I' is the indent level
  85. %% Terminal clause: put the fragments back in order and flatten them
  86. p1([], _R, _I, Acc) -> flatten(reverse(Acc));
  87. %% Tags have the highest precedence...
  88. p1([{tag, Tag} | T], R, I, Acc) ->
  89. case T of
  90. [] -> p1([], R, I,
  91. ["</p>", make_tag_str(Tag, R), "<p>" | Acc]);
  92. [{blank, _} | T2] -> p1(T2, R, I,
  93. [make_tag_str(Tag, R) | Acc]);
  94. _Other -> p1(T, R, I,
  95. [pad(I) ++ make_tag_str(Tag, R) | Acc])
  96. end;
  %% an opening block tag swallows everything up to its closing tag
  %% (see grab_for_blockhtml/3) and the block is emitted as collected
  97. p1([{blocktag, [{{{tag, open}, Type}, Tg}] = _Tag} | T], R, I, Acc) ->
  98. {Block, Rest} = grab_for_blockhtml(T, Type, []),
  99. Str = lists:flatten([Tg, "\n" | Block]),
  100. p1(Rest, R, I, [Str | Acc]);
  101. %% blank lines/linefeeds are gobbled down
  102. p1([{Type, _} | T], R, I, Acc)
  103. when Type == blank orelse Type == linefeed ->
  104. Rest = grab_empties(T),
  105. p1(Rest, R, I, [pad(I) ++ "\n" | Acc]);
  106. %% two consecutive normal lines should be concatenated...
  107. %% remembering to pad the second line with the indent...
  108. p1([{normal, P1}, {normal, P2} | T], R, I, Acc) ->
  109. p1([{normal, merge(P1, pad(I), P2)} | T], R, I, Acc);
  110. %% as should a normal and linefeed
  111. %% setext h1 is a look behind and it overrides blockquote and code...
  112. p1([{normal, P}, {setext_h1, _} | T], R, I, Acc) ->
  113. p1(T, R, I, [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
  114. ++ "</h1>\n\n" | Acc]);
  115. p1([{blockquote, P}, {setext_h1, _} | T], R, I, Acc) ->
  116. p1(T, R, I, [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
  117. ++ "</h1>\n\n" | Acc]);
  118. p1([{{codeblock, P}, _}, {setext_h1, _} | T], R, I, Acc) ->
  119. p1(T, R, I, [pad(I) ++ "<h1>" ++ make_str(snip(P), R)
  120. ++ "</h1>\n\n" | Acc]);
  121. p1([{blockquote, P}, {h2_or_hr, _} | T], R, I, Acc) ->
  122. p1(T, R, I, [pad(I) ++ "<h2>" ++ make_str(snip(P), R)
  123. ++ "</h2>\n\n" | Acc]);
  124. p1([{{codeblock, P}, _}, {h2_or_hr, _} | T], R, I, Acc) ->
  125. p1(T, R, I, [pad(I) ++ "<h2>" ++ make_str(snip(P), R)
  126. ++ "</h2>\n\n" | Acc]);
  127. %% but a setext with no lookbehind is just rendered as a normal line,
  128. %% so change its type and rethrow it
  129. p1([{setext_h1, P} | T], R, I, Acc) ->
  130. p1([{normal, P} | T], R, I, Acc);
  131. %% setext h2 might be a look behind
  132. p1([{normal, P}, {h2_or_hr, _} | T], R, I, Acc) ->
  133. P2 = string:strip(make_str(snip(P), R), both, ?SPACE),
  134. p1(T, R, I, [pad(I) ++ "<h2>" ++ P2 ++ "</h2>\n\n" | Acc]);
  135. %% blockquotes swallow each other
  136. %% replace the first blockquote mark with a space...
  137. p1([{blockquote, P1}, {blockquote, [_ | P2]} | T], R, I, Acc) ->
  138. p1([{blockquote, merge(P1, pad(I), [{{ws, sp}, " "} | P2])} | T], R, I, Acc);
  139. %% blockquotes swallow normal
  140. p1([{blockquote, P1}, {normal, P2} | T], R, I, Acc) ->
  141. p1([{blockquote, merge(P1, pad(I + 1), P2)} | T], R, I, Acc);
  142. %% blockquote
  %% (the first token has to be the '>' marker, otherwise the match below fails)
  143. p1([{blockquote, P} | T], R, I, Acc) ->
  144. [{{md, gt}, _} | T1] = P,
  145. T2 = string:strip(make_str(T1, R)),
  146. p1(T, R, I,
  147. ["\n<blockquote>\n" ++ pad(I + 1) ++ "<p>" ++ T2 ++ "</p>\n</blockquote>" | Acc]);
  148. %% one normal is just normal...
  149. p1([{normal, P} | T], R, I, Acc) ->
  150. P2 = string:strip(make_str(snip(P), R), both, ?SPACE),
  151. p1(T, R, I, [pad(I) ++ "<p>" ++ P2 ++ "</p>\n" | Acc]);
  152. %% atx headings
  153. p1([{{h1, P}, _} | T], R, I, Acc) ->
  154. NewP = string:strip(make_str(snip(P), R), right),
  155. p1(T, R, I, [pad(I) ++ "<h1>" ++ NewP ++ "</h1>\n\n" | Acc]);
  156. p1([{{h2, P}, _} | T], R, I, Acc) ->
  157. NewP = string:strip(make_str(snip(P), R), right),
  158. p1(T, R, I, [pad(I) ++ "<h2>" ++ NewP ++ "</h2>\n\n" | Acc]);
  159. p1([{{h3, P}, _} | T], R, I, Acc) ->
  160. NewP = string:strip(make_str(snip(P), R), right),
  161. p1(T, R, I, [pad(I) ++ "<h3>" ++ NewP ++ "</h3>\n\n" | Acc]);
  162. p1([{{h4, P}, _} | T], R, I, Acc) ->
  163. NewP = string:strip(make_str(snip(P), R), right),
  164. p1(T, R, I, [pad(I) ++ "<h4>" ++ NewP ++ "</h4>\n\n" | Acc]);
  165. p1([{{h5, P}, _} | T], R, I, Acc) ->
  166. NewP = string:strip(make_str(snip(P), R), right),
  167. p1(T, R, I, [pad(I) ++ "<h5>" ++ NewP ++ "</h5>\n\n" | Acc]);
  168. p1([{{h6, P}, _} | T], R, I, Acc) ->
  169. NewP = string:strip(make_str(snip(P), R), right),
  170. p1(T, R, I, [pad(I) ++ "<h6>" ++ NewP ++ "</h6>\n\n" | Acc]);
  171. %% unordered lists swallow normal and codeblock lines
  %% NOTE(review): the typing code visible in this file produces normal lines
  %% as {normal, Tokens}, not {{normal, Tokens}, Source}, so the
  %% {{normal, P2}, S2} clauses here and in the ordered-list section may
  %% never match - confirm before relying on them.
  172. p1([{{ul, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
  173. p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
  174. p1([{{ul, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
  175. p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
  176. p1([{{ul, _P}, _} | _T] = List, R, I, Acc) ->
  177. {Rest, NewAcc} = parse_list(ul, List, R, I, [], false),
  178. p1(Rest, R, I, [pad(I) ++ "<ul>\n" ++ NewAcc
  179. ++ pad(I) ++ "</ul>\n" | Acc]);
  180. %% ordered lists swallow normal and codeblock lines
  181. p1([{{ol, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
  182. p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
  183. p1([{{ol, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
  184. p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
  185. p1([{{ol, _P}, _} | _T] = List, R, I, Acc) ->
  186. {Rest, NewAcc} = parse_list(ol, List, R, I, [], false),
  187. p1(Rest, R, I, [pad(I) ++ "<ol>\n" ++ NewAcc
  188. ++ pad(I) ++ "</ol>\n" | Acc]);
  189. %% codeblock consumes any following empty lines
  190. %% and other codeblocks
  191. p1([{{codeblock, P1}, S1}, {{codeblock, P2}, S2} | T], R, I, Acc) ->
  192. p1([{{codeblock, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
  193. p1([{{codeblock, P}, _} | T], R, I, Acc) ->
  194. Rest = grab_empties(T),
  195. p1(Rest, R, I, ["<pre><code>" ++ make_str(snip(P), R)
  196. ++ "\n</code></pre>\n\n" | Acc]);
  197. %% horizontal rules
  198. p1([{hr, _} | T], R, I, Acc) ->
  199. p1(T, R, I, ["<hr />" | Acc]);
  200. %% h2_or_hr is greedy for normal lines
  201. p1([{h2_or_hr, P1}, {normal, P2} | T], R, I, Acc) ->
  202. p1([{normal, flatten([P1 | P2])} | T], R, I, Acc);
  203. %% the clause with a normal before an 'h2_or_hr' has already been
  204. %% handled further up the tree, so this is a bona fide 'hr'...
  205. p1([{h2_or_hr, _} | T], R, I, Acc) ->
  206. p1(T, R, I, ["<hr />" | Acc]);
  207. %% Now start pulling out inline refs etc, etc
  %% (reference definition lines produce no output of their own)
  208. p1([{inlineref, _P} | T], R, I, Acc) ->
  209. p1(T, R, I, Acc).
  %% Collect the lines that belong to an open HTML block tag of type Type.
  %% Returns {Fragments, RemainingLines}. Collection stops at the matching
  %% closing block tag; when the input runs out first, a closing tag for
  %% Type is synthesised. Block tags of another type get a line ending
  %% pushed after them, self-closing tags are copied as they are and every
  %% other line is rendered with make_plain_str/1.
  grab_for_blockhtml([], Type, Acc) ->
      Closing = "</" ++ Type ++ ">",
      {lists:reverse([Closing | Acc]), []};
  grab_for_blockhtml([{blocktag, [{{{tag, close}, Type}, Tg}]} | Rest],
                     Type, Acc) ->
      {lists:reverse([Tg | Acc]), Rest};
  grab_for_blockhtml([{blocktag, [{{{tag, _}, Other}, Tg}]} | Rest],
                     Type, Acc) when Other =/= Type ->
      %% blocktags grabbed in a blocktag need a line ending pushed
      grab_for_blockhtml(Rest, Type, ["\n", Tg | Acc]);
  grab_for_blockhtml([{tag, {{{tag, self_closing}, _Ty}, Tg}} | Rest],
                     Type, Acc) ->
      grab_for_blockhtml(Rest, Type, [Tg | Acc]);
  grab_for_blockhtml([Line | Rest], Type, Acc) ->
      {_LineType, Content} = Line,
      grab_for_blockhtml(Rest, Type, [make_plain_str(Content) | Acc]).
  %% Drop the leading linefeed and blank lines; return what is left.
  grab_empties([{Kind, _} | Rest]) when Kind =:= linefeed; Kind =:= blank ->
      grab_empties(Rest);
  grab_empties(Remaining) ->
      Remaining.
  %% Join the token lists of two consecutive lines: the first line gets its
  %% hard line break resolved (see make_br/1) and the padding is inserted
  %% between the two as a {string, Pad} token.
  merge(P1, Pad, P2) ->
      lists:flatten([make_br(P1), {string, Pad} | P2]).

  %% Replace a trailing "composite whitespace or tab, then linefeed" pair
  %% with a <br /> tag token; any other line is returned unchanged.
  make_br(Tokens) ->
      make_br1(lists:reverse(Tokens)).

  %% Works on the reversed token list, so the end of the line comes first.
  make_br1([{{lf, _}, _}, {{ws, Kind}, _} | Earlier])
    when Kind =:= comp; Kind =:= tab ->
      lists:reverse([{tags, " <br />\n"} | Earlier]);
  make_br1(Reversed) ->
      lists:reverse(Reversed).
  %% Build the padding for indent level N: a list holding N copies of " ".
  pad(N) ->
      pad1(N, []).

  %% Puts N copies of " " in front of Acc (N must be a non-negative integer).
  pad1(N, Acc) when is_integer(N), N >= 0 ->
      lists:duplicate(N, " ") ++ Acc.
  %% Render consecutive list lines of the given Type (ul or ol) as <li>
  %% items. Returns {RemainingLines, ItemsHtml}.
  %% The handling is a bit messy because of hard lines: items separated by
  %% a blank line get wrapped in a paragraph, items that are not separated
  %% do not - BUT once an item is <p> wrapped the next one is too. The last
  %% argument carries that wrap state from one item to the next.
  parse_list(_Type, [], _R, _I, A, _) ->
      {[], lists:reverse(A)};
  parse_list(Type, [{{Type, P}, _} | T], R, I, A, Wrap) ->
      {Rest, Grabbed, Wrapped} = grab(T, R, [], Wrap),
      Item = parse([{normal, P}], R),
      Li = case Wrapped of
               false ->
                   %% need to strip off the extra <p></p>'s
                   NoClose = string:left(Item, length(Item) - 4),
                   NoOpen = string:right(NoClose, length(NoClose) - 3),
                   NoOpen ++ "\n" ++ Grabbed ++ pad(I);
               true ->
                   string:strip(Item, right, ?LF) ++ Grabbed ++ pad(I)
           end,
      %% the next item is wrapped when a linefeed line directly follows
      NextWrap = case T of
                     [{linefeed, _} | _] -> true;
                     _ -> false
                 end,
      Html = pad(I) ++ "<li>" ++ string:strip(Li, right, ?LF) ++ "</li>\n",
      parse_list(Type, Rest, R, I, [Html | A], NextWrap);
  parse_list(_Type, List, _R, _I, A, _) ->
      {List, lists:reverse(A)}.
  %% grab/4 collects the lines that continue a list item: normals, double
  %% indented codeblocks, linefeeds and blanks.
  %% It stops grabbing when a normal is preceded by a linefeed or blank,
  %% UNLESS that normal starts with white space (grab2/6 decides that).
  %% Returns {RemainingLines, Grabbed, Wrap}; Wrap is 'true' if the 'li'
  %% should be wrapped in '<p></p>' and 'false' if it should not.
  grab([{{codeblock, _}, S} | T] = List, R, Acc, W) ->
      case is_blockquote(S, T) of
          {{true, Quoted}, AfterQuote} ->
              grab(AfterQuote, R,
                   ["</blockquote>",
                    make_esc_str(Quoted, R),
                    "<blockquote>" | Acc], W);
          {{esc_false, Escaped}, _} ->
              {Escaped, lists:reverse(Acc), false};
          {false, AfterLine} ->
              case is_double_indent(S) of
                  false ->
                      {List, lists:reverse(Acc), false};
                  {true, Code} ->
                      %% a double indent loses its leading indentation here
                      grab(AfterLine, R,
                           [" " ++ make_esc_str(Code, R) | Acc], W)
              end
      end;
  grab([{Kind, _} | T], R, Acc, false) when Kind =:= linefeed; Kind =:= blank ->
      grab2(T, R, Acc, T, Acc, true);
  grab([{Kind, _} | T], R, Acc, true) when Kind =:= linefeed; Kind =:= blank ->
      grab2(T, R, ["\n" | Acc], T, Acc, true);
  grab([{normal, P} | T], R, Acc, W) ->
      Li = case W of
               false ->
                   make_esc_str(P, R);
               true ->
                   "<p>" ++ string:strip(make_esc_str(P, R), right, ?LF)
                       ++ "</p>"
           end,
      grab(T, R, [Li | Acc], W);
  grab(List, _R, Acc, W) ->
      {List, lists:reverse(Acc), W}.
  %% grab2/6 is entered after a linefeed or blank line inside a list item.
  %% The problem is knowing when to grab: if the list is followed by a run
  %% of blank lines and linefeeds and then a normal line, nothing is
  %% grabbed; if that normal line starts with white space it is grabbed.
  %% LO and AO are the line list and accumulator from before the run; they
  %% are handed back when the run turns out not to belong to the item.
  grab2([{normal, [{{ws, _}, _} | Text]} | T], R, Acc, _LO, _AO, W) ->
      Li = case W of
               false ->
                   make_esc_str(Text, R);
               true ->
                   "<p>" ++ string:strip(make_esc_str(Text, R), right, ?LF)
                       ++ "</p>"
           end,
      grab(T, R, [Li | Acc], W);
  grab2([{normal, _} | _], _R, _Acc, LO, AO, _W) ->
      {LO, AO, false};
  grab2([{Kind, _} | T], R, Acc, LO, AO, _W)
    when Kind =:= linefeed; Kind =:= blank ->
      grab2(T, R, ["\n" | Acc], LO, AO, true);
  %% We dont want to grab this stuff so return the old list and the old acc
  grab2(_List, _R, _Acc, LO, AO, _W) ->
      {LO, AO, true}.
  %% Check whether a token list starts with a double indent, i.e. any
  %% combination of tabs and spaces that adds up to 8 columns (a space
  %% counts 1, a tab 4). Returns {true, TokensAfterTheIndent} or false;
  %% a line holding nothing but the indent yields false.
  is_double_indent(List) ->
      is_double_indent1(List, 0).

  is_double_indent1([], _Width) ->
      false;
  is_double_indent1(Rest, Width) when Width > 7 ->
      {true, Rest};
  is_double_indent1([{{ws, Kind}, _} | T], Width) when Kind =:= sp; Kind =:= tab ->
      Step = case Kind of
                 sp -> 1;
                 tab -> 4
             end,
      is_double_indent1(T, Width + Step);
  is_double_indent1(_Other, _Width) ->
      false.
  %% Decide whether the source tokens of a codeblock line are really an
  %% indented blockquote. T is the list of lines that follow it.
  %% Returns {false, T}, {{esc_false, Tokens}, T} for an escaped '>' or
  %% {{true, Quoted}, Remaining} where Quoted also holds the content of the
  %% following lines up to the next blank line (see grab2/2).
  is_blockquote(List, T) ->
      case is_bq1(List, 0) of
          {true, Quote} ->
              {Remaining, Quoted} = grab2(T, Quote),
              {{true, Quoted}, Remaining};
          {esc_false, Tokens} ->
              {{esc_false, Tokens}, T};
          false ->
              {false, T}
      end.
  %% Scan the leading white space of a line (a space counts 1, a tab 4).
  %% After more than 3 columns of indent:
  %%   '>' followed by white space  -> {true, TokensAfterThem}
  %%   '\' '>' and white space      -> {esc_false, [GtText, WsText | Rest]}
  %% Anything else gives false.
  is_bq1([], _Width) ->
      false;
  is_bq1([{{ws, sp}, _} | Rest], Width) ->
      is_bq1(Rest, Width + 1);
  is_bq1([{{ws, tab}, _} | Rest], Width) ->
      is_bq1(Rest, Width + 4);
  is_bq1([{{md, gt}, _}, {{ws, _}, _} | Rest], Width) when Width > 3 ->
      {true, Rest};
  is_bq1([{{punc, bslash}, _}, {{md, gt}, GT}, {{ws, _}, WS} | Rest], Width)
    when Width > 3 ->
      {esc_false, [GT, WS | Rest]};
  is_bq1(_Other, _Width) ->
      false.
  %% Append the content of the following lines to R until a blank line (which
  %% is dropped) or the end of the input. Returns {RemainingLines, Tokens}.
  grab2(List, R) ->
      gb2(List, lists:reverse(R)).

  %% Collected holds the content gathered so far, newest first.
  gb2([], Collected) ->
      {[], lists:flatten(lists:reverse(Collected))};
  gb2([{blank, _} | Remaining], Collected) ->
      {Remaining, lists:flatten(lists:reverse(Collected))};
  gb2([{_Type, Content} | Remaining], Collected) ->
      gb2(Remaining, [Content | Collected]).
  362. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  363. %%%
  364. %%% Make the lines from the raw tokens
  365. %%%
  366. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %% Chop the token stream into lines. Every line is a list of tokens and
  %% keeps its terminating linefeed token; a last line without a linefeed
  %% is kept as well.
  make_lines(Tokens) ->
      ml1(Tokens, [], []).

  %% Current is the line being built (newest token first), Lines the
  %% finished lines (newest line first).
  ml1([], [], Lines) ->
      lists:reverse(Lines);
  ml1([], Current, Lines) ->
      lists:reverse([lists:reverse(Current) | Lines]);
  ml1([{{lf, _}, _} = Lf | Rest], Current, Lines) ->
      ml1(Rest, [], [ml2(Lf, Current) | Lines]);
  ml1([Token | Rest], Current, Lines) ->
      ml1(Rest, [Token | Current], Lines).

  %% Finish a line: put the tokens back in order and append the linefeed.
  ml2(Lf, Current) ->
      lists:reverse(Current, [Lf]).
  373. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  374. %%%
  375. %%% Process the lines and give each line a type. The valid types are:
  376. %%% * normal line
  377. %%% * reference style links
  378. %%% * reference style images
  379. %%% * special line types
  380. %%% - blank
  381. %%% - SETEXT header lines
  382. %%% - ATX header lines
  383. %%% - unordered lists (including code blocks)
  384. %%% - ordered lists (including code blocks)
  385. %%% - blockquotes
  386. %%% - code blocks
  387. %%% - horizontal rules
  388. %%%
  389. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %% Give every line a type and collect the reference definitions.
  %% Returns {TypedLines, Refs}; blank and linefeed lines at the beginning
  %% and the end of the document are stripped from TypedLines.
  type_lines(Lines) ->
      {References, Typed} = t_l1(Lines, [], []),
      {strip_lines(Typed), References}.
  %% t_l1(Lines, A1, A2) types every line.
  %%   A1 - the reference definitions found so far
  %%   A2 - the typed lines, newest first
  %% Returns {Refs, TypedLines}. The clause order is significant: the more
  %% specific line shapes have to be tried before the general ones.
  t_l1([], A1, A2) ->
      {A1, lists:reverse(A2)};
  %% these clauses extract URL and Image refs
  %% (they are the only ones that change A1 - see t_inline/5)
  %% an inline may have leading white space.
  %% The composite white space case used to evaluate 'case gt(W, 3) of ...',
  %% throw the result away (its false branch even recursed on the tokens of
  %% the line instead of the remaining lines) and then call t_inline/5
  %% anyway. That dead work is gone; the result is unchanged.
  %% NOTE(review): the original comment says an inline can have "up to 3
  %% spaces" before it, but no width limit is (or ever was) enforced here -
  %% confirm whether wider indents should be typed differently.
  t_l1([[{{ws, Kind}, _}, {{inline, open}, _} | T1] = H | T2], A1, A2)
    when Kind =:= sp; Kind =:= tab; Kind =:= comp ->
      t_inline(H, T1, T2, A1, A2);
  t_l1([[{{inline, open}, _} | T1] = H | T2], A1, A2) ->
      t_inline(H, T1, T2, A1, A2);
  %% types setext lines
  t_l1([[{{md, eq}, _} | _T] = H | T], A1, A2) ->
      t_l1(T, A1, [type_setext_h1(H) | A2]);
  %% NOTE 1: generates a ul as the default not a normal line
  %% NOTE 2: depending on the context this might generate an <h2> header
  %%         or an <hr />
  %% NOTE 3: space - is typed to a bullet down in <ul> land...
  t_l1([[{{md, dash}, _} | _T] = H | T], A1, A2) ->
      t_l1(T, A1, [type_setext_h2(H) | A2]);
  %% types atx lines
  t_l1([[{{md, atx}, _} | _T] = H | T], A1, A2) ->
      t_l1(T, A1, [type_atx(H) | A2]);
  %% types blockquotes
  %% a blockquote mark on its own or followed by a linefeed is
  %% displayed 'as is' by showdown
  t_l1([[{{md, gt}, _}] = H | T], A1, A2) ->
      t_l1(T, A1, [{normal, H} | A2]);
  t_l1([[{{md, gt}, _}, {{lf, _}, _}] = H | T], A1, A2) ->
      t_l1(T, A1, [{normal, H} | A2]);
  %% one with anything after it starts a blockquote
  t_l1([[{{md, gt}, _} | _T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{blockquote, H} | A2]);
  %% types unordered lists lines
  %% NOTE 1: the dashed version is generated in type_setext_h2
  %% NOTE 2: the asterisk version also might generate a horizontal rule
  %%         which is why it jumps to type_star2 <-- note the 2!!
  t_l1([[{{ws, _}, _}, {{md, star}, _} = ST1,
         {{ws, _}, _} = WS1 | T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{type_star2([ST1, WS1 | T1]), H} | A2]);
  t_l1([[{{md, star}, _}, {{ws, _}, _} | _T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{type_star2(H), H} | A2]);
  t_l1([[{{ws, _}, _}, {{md, plus}, _},
         {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
  t_l1([[{{md, plus}, _}, {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
  %% UL based on dashes
  t_l1([[{{ws, _}, _}, {{md, dash}, _},
         {{ws, _}, _} = W | T1] = H | T], A1, A2) ->
      t_l1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]);
  %% types ordered lists...
  t_l1([[{{ws, _}, _}, {num, _} = N1 | T1] | T], A1, A2) ->
      t_l1(T, A1, [type_ol([N1 | T1]) | A2]);
  t_l1([[{num, _} | _T] = H | T], A1, A2) ->
      t_l1(T, A1, [type_ol(H) | A2]);
  %% types horizontal rules for stars and underscores
  %% dashes and some stars are done elsewhere...
  t_l1([[{{md, underscore}, _} | _T1] = H | T], A1, A2) ->
      t_l1(T, A1, [type_underscore(H) | A2]);
  t_l1([[{{md, star}, _} | _T1] = H | T], A1, A2) ->
      t_l1(T, A1, [type_star(H) | A2]);
  %% Block level tags - these are look ahead, they must be on a single
  %% line (ie directly followed by a lf and nothing else)
  t_l1([[{{{tag, _Type}, Tag}, _} = H | T1] = List | T], A1, A2) ->
      case is_blank(T1) of
          false ->
              t_l1(T, A1, [{normal, List} | A2]);
          true ->
              case is_block_tag(Tag) of
                  true -> t_l1(T, A1, [{blocktag, [H]} | A2]);
                  false -> t_l1(T, A1, [{tag, [H | T1]} | A2])
              end
      end;
  %% types a blank line or a code block
  t_l1([[{{lf, _}, _}] = H | T], A1, A2) ->
      t_l1(T, A1, [{linefeed, H} | A2]);
  t_l1([[{{ws, _}, _} | _T1] = H | T], A1, A2) ->
      t_l1(T, A1, [type_ws(H) | A2]);
  %% Final clause...
  t_l1([H | T], A1, A2) ->
      t_l1(T, A1, [{normal, H} | A2]).
  %% Type a line that opens like a reference definition. H is the whole
  %% line, T1 its tokens after the opening bracket, T2 the remaining lines.
  %% When snip_ref/1 recognises a reference it is added to A1 and the line
  %% gets the type snip_ref/1 reports; otherwise the line is a normal line.
  t_inline(H, T1, T2, A1, A2) ->
      case snip_ref(T1) of
          normal ->
              t_l1(T2, A1, [{normal, H} | A2]);
          {Type, {Id, {Url, Title}}} ->
              Refs = lists:flatten([{Id, {Url, Title}} | A1]),
              t_l1(T2, Refs, [{Type, H} | A2])
      end.
  %% Strip the linefeed and blank lines from the beginning and the end of a
  %% list of typed lines.
  strip_lines(List) ->
      NoLeading = strip_l1(List),
      NoTrailing = strip_l1(lists:reverse(NoLeading)),
      lists:reverse(NoTrailing).

  %% Drops the linefeed/blank lines at the head of the list.
  strip_l1([{Kind, _} | Rest]) when Kind =:= linefeed; Kind =:= blank ->
      strip_l1(Rest);
  strip_l1(List) ->
      List.
  493. %%
  494. %% Loads of type rules...
  495. %%
  %% True when the tokens are only white space, optionally ended by one
  %% linefeed token (an empty list counts as blank).
  is_blank([]) ->
      true;
  is_blank([{{lf, _}, _}]) ->
      true;
  is_blank([{{ws, _}, _} | Rest]) ->
      is_blank(Rest);
  is_blank(_Other) ->
      false.
  %% True when Tag is the name of one of the HTML block level tags that are
  %% passed through as a block.
  is_block_tag(Tag) ->
      lists:member(Tag,
                   ["address", "blockquote", "center", "dir", "div", "dl",
                    "fieldset", "form",
                    "h1", "h2", "h3", "h4", "h5", "h6",
                    "hr", "isindex", "menu", "noframes", "noscript",
                    "ol", "p", "pre",
                    "table", "thead", "tbody", "tr", "td",
                    "ul"]).
  %% Type a line that starts with an underscore.
  %% Returns {hr, List} when the line (ignoring trailing white space) holds
  %% nothing but underscores; otherwise type_underscore2/1 picks the type.
  %% The intermediate atom is written 'maybe' (quoted): unquoted, maybe is a
  %% reserved word once the maybe_expr feature is on, which is the default
  %% from OTP 27, and the module would no longer compile. The quoted form is
  %% the same atom on every release.
  type_underscore(List) ->
      case type_underscore1(trim_right(List)) of
          hr -> {hr, List};
          'maybe' -> {type_underscore2(List), List}
      end.

  %% hr when every token is an underscore, 'maybe' otherwise.
  type_underscore1([]) ->
      hr;
  type_underscore1([{{md, underscore}, _} | T]) ->
      type_underscore1(T);
  type_underscore1(_List) ->
      'maybe'.

  %% hr for exactly "_ _ _" (three underscores separated by white space),
  %% normal for anything else.
  type_underscore2(List) ->
      case trim_right(List) of % be permissive of trailing spaces
          [{{md, underscore}, _}, {{ws, _}, _},
           {{md, underscore}, _}, {{ws, _}, _},
           {{md, underscore}, _}] ->
              hr;
          _Other ->
              normal
      end.
  %% Type a line that starts with a star.
  %% Returns {hr, TrimmedTokens} when the line (ignoring trailing white
  %% space) holds nothing but stars; otherwise type_star2/1 picks the type.
  %% The intermediate atom is written 'maybe' (quoted): unquoted, maybe is a
  %% reserved word once the maybe_expr feature is on, which is the default
  %% from OTP 27, and the module would no longer compile. The quoted form is
  %% the same atom on every release.
  type_star(List) ->
      Trim = trim_right(List),
      case type_star1(Trim) of % be permissive of trailing spaces
          hr ->
              {hr, trim_right(Trim)};
          'maybe' ->
              case type_star2(List) of
                  normal ->
                      %% a normal line is prepended with a special
                      %% non-space filling white space token
                      {normal, [{{ws, none}, none} | List]};
                  Type ->
                      {Type, List}
              end
      end.

  %% hr when every token is a star, 'maybe' otherwise.
  type_star1([]) ->
      hr;
  type_star1([{{md, star}, _} | T]) ->
      type_star1(T);
  type_star1(_List) ->
      'maybe'.

  %% hr for exactly "* * *", {ul, Content} for a star followed by white
  %% space, normal for anything else.
  type_star2(List) ->
      case trim_right(List) of
          [{{md, star}, _}, {{ws, _}, _},
           {{md, star}, _}, {{ws, _}, _},
           {{md, star}, _}] ->
              hr;
          _Other ->
              case List of
                  [{{md, star}, _}, {{ws, _}, _} = WS | T] ->
                      {ul, make_list_str([WS | T])};
                  _Other2 ->
                      normal
              end
      end.
  %% Type a line that starts with a number.
  %%   digits, '.', white space  -> {{ol, Content}, List}
  %%   digit, '\', '.'           -> {normal, Tokens} with the backslash dropped
  %%   anything else             -> {normal, List}
  type_ol(List) ->
      case type_ol1(List, []) of
          {ol, Content} -> {{ol, Content}, List};
          {esc_normal, Unescaped} -> {normal, Unescaped};
          normal -> {normal, List}
      end.

  %% this clause terminates on an escaped fullstop after a number
  %% (but you need to drop the bslash...)
  type_ol1([{num, _} = Num,
            {{punc, bslash}, _},
            {{punc, fullstop}, _} = Stop | Rest], Digits) ->
      {esc_normal, lists:flatten([lists:reverse(Digits), Num, Stop | Rest])};
  %% the digits are accumulated in case a full stop has to be escaped in a
  %% normal line
  type_ol1([{num, _} = Num | Rest], Digits) ->
      type_ol1(Rest, [Num | Digits]);
  type_ol1([{{punc, fullstop}, _}, {{ws, _}, _} | Rest], _Digits) ->
      {ol, Rest};
  type_ol1(_List, _Digits) ->
      normal.
  %% Type an atx ('#') heading line.
  %% You need to understand what this function is trying to do...
  %%   '### blah' is fine
  %%   '### blah ###' is reduced to '### blah' because trailing #'s are
  %%   just for show but...
  %%   '##' is like applying '#' to '#' <-- 1 less styling on a single #
  %%   and '###' is like applying '##' to '#' etc, etc
  %%   but after you hit 6 #'s you just get this for a single hash,
  %%   ie '#############' is like applying '######' to a single '#'
  %%   but/and '######## blah' is like applying '######' to '## blah'
  %% Trailing #'s are stripped as they are decorative only.
  type_atx(List) ->
      {Level, Rest} = get_atx_size(List),
      {atx_type(is_all_hashes(Rest), Level, Rest), List}.

  %% Picks the type from the number of leading hashes and what follows
  %% them; the first argument tells whether only hashes follow.
  atx_type(false, Level, Rest) ->
      {atx_heading(Level), strip_atx(Rest)};
  atx_type(true, Level, Rest) ->
      Hash = [{{md, atx}, "#"}],
      if
          Level == 1 ->
              normal;
          ((Level > 1) andalso (Level < 6)) ->
              {atx_heading(Level - 1), Hash};
          ((Level == 6) andalso ((Rest == [])
                                 orelse (Rest == [{{lf, lf}, "\n"}])
                                 orelse (Rest == [{{lf, crlf}, "\r\n"}]))) ->
              {h5, Hash};
          ((Level == 6) andalso (Rest =/= [])) ->
              {h6, Hash}
      end.

  %% The heading atom for a level: 1 -> h1, 2 -> h2, ...
  atx_heading(Level) ->
      list_to_atom("h" ++ integer_to_list(Level)).
  %% True when the tokens are only atx hashes, optionally ended by one
  %% linefeed token (an empty list counts as all hashes).
  is_all_hashes([]) ->
      true;
  is_all_hashes([{{md, atx}, _} | Rest]) ->
      is_all_hashes(Rest);
  is_all_hashes([{{lf, _}, _}]) ->
      true;
  is_all_hashes(_Other) ->
      false.
  %% Count the leading atx hashes of a line, at most 6.
  %% Returns {Count, RemainingTokens}.
  get_atx_size(List) ->
      g_atx_size1(List, 0).

  %% White space met while counting is skipped, which also strips the
  %% white space to the left of the heading text. Counting stops at 6: a
  %% further hash stays in the remaining tokens.
  g_atx_size1([{{md, atx}, _} | _] = List, Count) when Count == 6 ->
      {6, List};
  g_atx_size1([{{md, atx}, _} | Rest], Count) ->
      g_atx_size1(Rest, Count + 1);
  g_atx_size1([{{ws, _}, _} | Rest], Count) ->
      g_atx_size1(Rest, Count);
  g_atx_size1(List, Count) ->
      {Count, List}.
  %% Remove the decorative hashes from the end of an atx heading. When the
  %% hashes are followed by a linefeed token, that token is removed too.
  strip_atx(List) ->
      lists:reverse(s_atx1(lists:reverse(List))).

  %% Works on the reversed token list, so the end of the line comes first.
  s_atx1([{{lf, _}, _}, {{md, atx}, _} | Rest]) ->
      s_atx1(Rest);
  s_atx1([{{md, atx}, _} | Rest]) ->
      s_atx1(Rest);
  s_atx1(List) ->
      List.
  %% Type a line that starts with '='. A line made of '=' only (with an
  %% optional final linefeed) is a setext h1 underline; any other line is
  %% a normal line.
  type_setext_h1(List) ->
      type_s_h1_1(List, []).

  %% terminates on running out or new line
  type_s_h1_1([{{lf, _}, _} = Lf], Seen) ->
      {setext_h1, lists:reverse([Lf | Seen])};
  type_s_h1_1([], Seen) ->
      {setext_h1, lists:reverse(Seen)};
  type_s_h1_1([[] | Rest], Seen) ->
      type_s_h1_1(Rest, Seen);
  type_s_h1_1([{{md, eq}, _} = Eq | Rest], Seen) ->
      type_s_h1_1(Rest, [Eq | Seen]);
  type_s_h1_1(Rest, Seen) ->
      {normal, lists:flatten([Seen | Rest])}.
  %% Type a line that starts with a dash.
  %% A line made of dashes only is an h2 underline or a horizontal rule
  %% (the parser decides from the context); otherwise the line is a rule
  %% ("- - -"), an unordered list item ("- text") or a normal line.
  type_setext_h2(List) ->
      case type_s_h2_1(List) of
          not_h2 -> {type_s_h2_2(trim_right(List)), List};
          h2_or_hr -> {h2_or_hr, List}
      end.

  %% terminates on running out or new line
  type_s_h2_1([{{lf, _}, _}]) ->
      h2_or_hr;
  type_s_h2_1([]) ->
      h2_or_hr;
  type_s_h2_1([[] | Rest]) ->
      type_s_h2_1(Rest);
  type_s_h2_1([{{md, dash}, _} | Rest]) ->
      type_s_h2_1(Rest);
  type_s_h2_1(_Other) ->
      not_h2.

  %% Expects the line without its trailing white space.
  type_s_h2_2([{{md, dash}, _}, {{ws, _}, _},
               {{md, dash}, _}, {{ws, _}, _},
               {{md, dash}, _}]) ->
      hr;
  type_s_h2_2([{{md, dash}, _}, {{ws, _}, _} = WS | Rest]) ->
      {ul, make_list_str([WS | Rest])};
  type_s_h2_2(_Other) ->
      normal.
  %% Type a line that starts with white space: a blank line, a code block
  %% (indented by a tab or by enough composite white space) or a normal line.
  type_ws(List) ->
      case type_ws1(List) of
          blank ->
              {blank, List};
          try_codeblock ->
              case type_ws2(List) of
                  {codeblock, Code} -> {{codeblock, Code}, List};
                  normal -> {normal, List}
              end
      end.

  %% blank when the line holds white space only, try_codeblock otherwise.
  type_ws1([]) ->
      blank;
  type_ws1([{{lf, _}, _}]) ->
      blank;
  type_ws1([[] | Rest]) ->
      type_ws1(Rest);
  type_ws1([{{ws, _}, _} | Rest]) ->
      type_ws1(Rest);
  type_ws1(_Other) ->
      try_codeblock.

  %% 4 or more spaces takes you over the limit
  %% (a tab is 4...)
  type_ws2([{{ws, tab}, _} | Rest]) ->
      {codeblock, Rest};
  type_ws2([{{ws, comp}, W} | Rest]) ->
      case gt(W, 4) of
          {true, Remainder} -> {codeblock, [Remainder | Rest]};
          false -> normal
      end;
  type_ws2([{{ws, sp}, _} | _Rest]) ->
      normal.
  %% Check whether the white space String is at least Len columns wide.
  %% Returns {true, {{ws, sp}, Surplus}}, where Surplus is the white space
  %% beyond the first Len columns, or false when String is narrower.
  %% Every tab counts as 4 columns, the width is_double_indent1/2 and
  %% is_bq1/2 use for a tab. The previous version expanded only the first
  %% tab (no 'global' option) and expanded it to a single column.
  gt(String, Len) ->
      Expanded = re:replace(String, "\t", "    ", [global, {return, list}]),
      case length(Expanded) >= Len of
          true -> {true, {{ws, sp}, lists:nthtail(Len, Expanded)}};
          false -> false
      end.
  %% make a tag into a string: tag tokens contribute their original text,
  %% every other token is rendered with make_str/2
  make_tag_str(L, R) ->
      make_tag1(L, R, []).

  %% Acc holds fragments that were rendered earlier, newest first.
  make_tag1(Tokens, R, Acc) ->
      Render = fun({{{tag, _Type}, _Tag}, Text}) -> Text;
                  (Token) -> make_str([Token], R)
               end,
      lists:reverse(Acc, lists:map(Render, Tokens)).
  %% Replace every non-breaking space in String with an ordinary space.
  esc_tag(String) ->
      esc_t1(String, []).

  %% Acc holds characters that were handled earlier, newest first.
  esc_t1(String, Acc) ->
      Swap = fun(?NBSP) -> ?SPACE; % non-breaking space to space
                (Char) -> Char
             end,
      lists:reverse(Acc, lists:map(Swap, String)).
  %% Prepare the content of a list item. The list must start with a white
  %% space token: that token is discarded, unless the white space adds up
  %% to a double indent, in which case the text after it is wrapped in
  %% <pre><code> tag tokens.
  make_list_str([{{ws, _}, _} | Content] = List) ->
      case is_double_indent(List) of
          {true, Code} ->
              lists:flatten([{tags, "<pre><code>"},
                             Code,
                             {tags, "</code></pre>\n\n"}]);
          false ->
              Content
      end.
  %% Try to read a reference definition ("id]: url title") from the tokens
  %% that follow the opening bracket.
  %% Returns {inlineref, {Id, {Url, Title}}} or the atom normal.
  %% All ref processing can ignore the original values 'cos those have
  %% already been captured at a higher level.
  %% Only an id made of exactly one token is supported. An id of zero or
  %% several tokens (for instance "[]: url") used to crash with a
  %% case_clause error; such a line is now reported as normal, which the
  %% caller types as a normal line.
  snip_ref(List) ->
      case get_id(List) of
          {[{_, Id}], Rest} ->
              {_Rest2, Ref, Title} = parse_inline(Rest),
              Ref2 = trim(Ref),
              Rs = htmlencode(make_plain_str(Ref2)),
              Ts = make_plain_str(Title),
              {inlineref, {Id, {Rs, Ts}}};
          {_IdTokens, _Rest} ->
              normal;
          normal ->
              normal
      end.
  %% Split the tokens at the first "closing bracket, colon, white space"
  %% sequence. Returns {IdTokens, TokensAfterTheSequence}, or the atom
  %% normal when the sequence does not occur.
  get_id(List) ->
      g_id1(List, []).

  %% Seen holds the tokens in front of the sequence, newest first.
  g_id1([], _Seen) ->
      normal;
  g_id1([{{inline, close}, _}, {{punc, colon}, _}, {{ws, _}, _} | Rest], Seen) ->
      {lists:reverse(Seen), Rest};
  g_id1([Token | Rest], Seen) ->
      g_id1(Rest, [Token | Seen]).
  %% Split the tokens of a link target into the url and the title.
  %% Returns {RemainingTokens, UrlTokens, TitleTokens}.
  parse_inline(List) ->
      p_in1(List, []).

  %% Url holds the url tokens seen so far, newest first.
  %% snip off the terminal linefeed (if there is one...)
  p_in1([{{lf, _}, _}], Url) ->
      {[], lists:reverse(Url), []};
  p_in1([], Url) ->
      {[], lists:reverse(Url), []};
  %% brackets and quotes can be escaped: drop the backslash, keep the token
  p_in1([{{punc, bslash}, _}, {Kind, _} = Token | Rest], Url)
    when Kind =:= bra; Kind =:= ket;
         Kind =:= {punc, doubleq}; Kind =:= {punc, singleq} ->
      p_in1(Rest, [Token | Url]);
  %% these clauses capture the start of the title...
  p_in1([{{punc, doubleq}, _} | Rest], Url) ->
      p_in2(Rest, lists:reverse(Url), doubleq, []);
  p_in1([{{punc, singleq}, _} | Rest], Url) ->
      p_in2(Rest, lists:reverse(Url), singleq, []);
  p_in1([{bra, _} | Rest], Url) ->
      p_in2(Rest, lists:reverse(Url), brackets, []);
  %% a closing bracket ends the target; there is no title
  p_in1([{ket, _} | Rest], Url) ->
      {Rest, lists:reverse(Url), []};
  p_in1([Token | Rest], Url) ->
      p_in1(Rest, [Token | Url]).
  749. %% this gets titles in single and double quotes
  750. %% the delimiter type is passed in as 'D'
  751. p_in2([], Url, _D, A) -> {[], Url, flatten(reverse(A))};
  752. %% brackets can be escaped
  753. p_in2([{{punc, bslash}, _},
  754. {bra, _} = B | T], Url, D, A) -> p_in2(T, Url, D, [B | A]);
  755. p_in2([{{punc, bslash}, _},
  756. {ket, _} = B | T], Url, D, A) -> p_in2(T, Url, D, [B | A]);
  757. %% quotes can be escaped
  758. p_in2([{{punc, bslash}, _},
  759. {{punc, doubleq}, _}= Q | T], Url, D, A) -> p_in2(T, Url, D, [Q | A]);
  760. p_in2([{{punc, bslash}, _},
  761. {{punc, singleq}, _} = Q | T], Url, D, A) -> p_in2(T, Url, D, [Q | A]);
  762. %% these clauses capture the end of the title and drop the delimiter...
  763. p_in2([{{punc, doubleq}, _} | T], Url, doubleq, A) -> p_in2(T, Url, none, A);
  764. p_in2([{{punc, singleq}, _} | T], Url, singleq, A) -> p_in2(T, Url, none, A);
  765. p_in2([{ket, _} | T], Url, brackets, A) -> p_in2(T, Url, none, A);
  766. %% terminator clause
  767. p_in2([{ket, _} | T], Url, none, A) -> {T, Url, flatten(reverse(A))};
  768. %% this clause silently discards stuff after the delimiter...
  769. p_in2([_H | T], Url, none, A) -> p_in2(T, Url, none, [A]);
  770. p_in2([H | T], Url, D, A) -> p_in2(T, Url, D, [H | A]).
  771. trim(String) -> trim_left(trim_right(String)).
  772. trim_right(String) -> reverse(trim_left(reverse(String))).
  773. trim_left([{{ws, _}, _} | T]) -> trim_left(T);
  774. trim_left([[] | T]) -> trim_left(T);
  775. trim_left(List) -> List.
  776. snip(List) -> List2 = reverse(List),
  777. case List2 of
  778. [{{lf, _}, _} | T] -> lists:reverse(T);
  779. _ -> List
  780. end.
  781. %% end of ref processing
  782. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  783. %%%
  784. %%% Build the Lexed Token List
  785. %%% This is a two part lexer, first it chunks the input and then on the second
  786. %%% pass it gathers it into lines and types the lines
  787. %%%
  788. %%% NOTE that there are two different styles of processing lines:
  789. %%% * markdown transformed
  790. %%% * block
  791. %%% inside block processing the whole text is dumped and just url encoded
  792. %%% and the original text is always maintained during the lexing/parsing
  793. %%% so that it can be recreated if the context requires it...
  794. %%%
  795. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  796. lex(String) -> merge_ws(l1(String, [], [])).
  797. merge_ws(List) -> m_ws1(List, []).
  798. m_ws1([], Acc) -> reverse(Acc);
  799. m_ws1([{{ws, _}, W1}, {{ws, _}, W2} | T], Acc) ->
  800. m_ws1([{{ws, comp}, W1 ++ W2} | T], Acc);
  801. m_ws1([H | T], Acc) -> m_ws1(T, [H | Acc]).
  802. %% this is the terminal head which ends the parsing...
  803. l1([], [], A2) -> flatten(reverse(A2));
  804. l1([], A1, A2) -> l1([], [], [l2(A1) | A2]);
  805. %% these two heads capture opening and closing tags
  806. l1([$<, $/|T], A1, A2) -> {Tag, NewT} = closingdiv(T, []),
  807. l1(NewT, [], [Tag, l2(A1) | A2]);
  808. l1([$< | T], A1, A2) -> {Tag, NewT} = openingdiv(T),
  809. l1(NewT, [], [Tag , l2(A1) | A2]);
  810. %% these clauses are the normal lexer clauses
  811. l1([$= | T], A1, A2) -> l1(T, [], [{{md, eq}, "="}, l2(A1) | A2]);
  812. l1([$- | T], A1, A2) -> l1(T, [], [{{md, dash}, "-"}, l2(A1) | A2]);
  813. l1([$# | T], A1, A2) -> l1(T, [], [{{md, atx}, "#"}, l2(A1) | A2]);
  814. l1([$> | T], A1, A2) -> l1(T, [], [{{md, gt}, ">"}, l2(A1) | A2]);
  815. l1([$+ | T], A1, A2) -> l1(T, [], [{{md, plus}, "+"}, l2(A1) | A2]);
  816. l1([$* | T], A1, A2) -> l1(T, [], [{{md, star}, "*"}, l2(A1) | A2]);
  817. l1([$_ | T], A1, A2) -> l1(T, [], [{{md, underscore}, "_"}, l2(A1) | A2]);
  818. l1([$1 | T], A1, A2) -> l1(T, [], [{num, "1"}, l2(A1) | A2]);
  819. l1([$2 | T], A1, A2) -> l1(T, [], [{num, "2"}, l2(A1) | A2]);
  820. l1([$3 | T], A1, A2) -> l1(T, [], [{num, "3"}, l2(A1) | A2]);
  821. l1([$4 | T], A1, A2) -> l1(T, [], [{num, "4"}, l2(A1) | A2]);
  822. l1([$5 | T], A1, A2) -> l1(T, [], [{num, "5"}, l2(A1) | A2]);
  823. l1([$6 | T], A1, A2) -> l1(T, [], [{num, "6"}, l2(A1) | A2]);
  824. l1([$7 | T], A1, A2) -> l1(T, [], [{num, "7"}, l2(A1) | A2]);
  825. l1([$8 | T], A1, A2) -> l1(T, [], [{num, "8"}, l2(A1) | A2]);
  826. l1([$9 | T], A1, A2) -> l1(T, [], [{num, "9"}, l2(A1) | A2]);
  827. l1([$0 | T], A1, A2) -> l1(T, [], [{num, "0"}, l2(A1) | A2]);
  828. l1([$. | T], A1, A2) -> l1(T, [], [{{punc, fullstop}, "."}, l2(A1) | A2]);
  829. l1([$: | T], A1, A2) -> l1(T, [], [{{punc, colon}, ":"}, l2(A1) | A2]);
  830. l1([$' | T], A1, A2) -> l1(T, [], [{{punc, singleq}, "'"}, l2(A1) | A2]); %'
  831. l1([$" | T], A1, A2) -> l1(T, [], [{{punc, doubleq}, "\""}, l2(A1) | A2]); %"
  832. l1([$` | T], A1, A2) -> l1(T, [], [{{punc, backtick}, "`"}, l2(A1) | A2]); %"
  833. l1([$! | T], A1, A2) -> l1(T, [], [{{punc, bang}, "!"}, l2(A1) | A2]); %"
  834. l1([$\\ | T], A1, A2) -> l1(T, [], [{{punc, bslash}, "\\"}, l2(A1) | A2]); %"
  835. l1([$/ | T], A1, A2) -> l1(T, [], [{{punc, fslash}, "/"}, l2(A1) | A2]); %"
  836. l1([$( | T], A1, A2) -> l1(T, [], [{bra, "("}, l2(A1) | A2]);
  837. l1([$) | T], A1, A2) -> l1(T, [], [{ket, ")"}, l2(A1) | A2]);
  838. l1([$[ | T], A1, A2) -> l1(T, [], [{{inline, open}, "["}, l2(A1) | A2]);
  839. l1([$] | T], A1, A2) -> l1(T, [], [{{inline, close}, "]"}, l2(A1) | A2]);
  840. %% note there is a special 'whitespace' {{ws, none}, ""} which is used to generate non-space
  841. %% filling whitespace for cases like '*bob* is great' which needs a non-space filling
  842. %% whitespace prepended to trigger emphasis so it renders as "<em>bob</em> is great...
  843. %% that 'character' doesn't exist so isn't in the lexer but appears in the parser
  844. l1([?SPACE | T], A1, A2) -> l1(T, [], [{{ws, sp}, " "}, l2(A1) | A2]);
  845. l1([?TAB | T], A1, A2) -> l1(T, [], [{{ws, tab}, "\t"}, l2(A1) | A2]);
  846. l1([?NBSP | T], A1, A2) -> l1(T, [], [{{ws, sp}, "&nbsp"}, l2(A1) | A2]);
  847. l1([?CR, ?LF | T], A1, A2) -> l1(T, [], [{{lf, crlf}, [?CR , ?LF]}, l2(A1) | A2]);
  848. l1([?LF | T], A1, A2) -> l1(T, [], [{{lf, lf}, [?LF]}, l2(A1) | A2]);
  849. %% l1([?CR | T], A1, A2) -> l1(T, [], [{{lf, cr}, [?CR]}, l2(A1) | A2]);
  850. %% this final clause accumulates line fragments
  851. l1([H|T], A1, A2) -> l1(T, [H |A1] , A2).
  852. l2([]) -> [];
  853. l2(List) -> {string, flatten(reverse(List))}.
  854. %% need to put in regexes for urls and e-mail addies
  855. openingdiv(String) ->
  856. case get_url(String) of
  857. {{url, URL}, R1} -> {{url, URL}, R1};
  858. not_url ->
  859. case get_email_addie(String) of
  860. {{email, EM}, R2} -> {{email, EM}, R2};
  861. not_email -> openingdiv1(String, [])
  862. end
  863. end.
  864. % dumps out a list if it is not an opening div
  865. openingdiv1([], Acc) -> {flatten([{{punc, bra}, "<"}
  866. | lex(reverse(Acc))]), []};
  867. openingdiv1([$/,$>| T], Acc) -> Acc2 = flatten(reverse(Acc)),
  868. Acc3 = string:to_lower(Acc2),
  869. [Tag | _T] = string:tokens(Acc3, " "),
  870. {{{{tag, self_closing}, Tag}, "<"
  871. ++ Acc2 ++ "/>"}, T};
  872. %% special for non-tags
  873. openingdiv1([$>| T], []) -> {[{{punc, bra}, "<"},
  874. {{punc, ket}, ">"}], T};
  875. openingdiv1([$>| T], Acc) -> Acc2 = flatten(reverse(Acc)),
  876. Acc3 = string:to_lower(Acc2),
  877. [Tag | _T] = string:tokens(Acc3, " "),
  878. {{{{tag, open}, Tag}, "<"
  879. ++ Acc2 ++ ">"}, T};
  880. openingdiv1([H|T], Acc) -> openingdiv1(T, [H | Acc]).
  881. % dumps out a list if it is not an closing div
  882. closingdiv([], Acc) -> {flatten([{{punc, bra}, "<"},
  883. {{punc, fslash}, "/"}
  884. | lex(reverse(Acc))]), []};
  885. closingdiv([$>| T], Acc) -> Acc2 = flatten(reverse(Acc)),
  886. Acc3 = string:to_lower(Acc2),
  887. [Tag | _T] = string:tokens(Acc3, " "),
  888. {{{{tag, close}, Tag}, "</"
  889. ++ Acc2 ++ ">"}, T};
  890. closingdiv([H|T], Acc) -> closingdiv(T, [H | Acc]).
  891. get_url(String) -> HTTP_regex = "^(H|h)(T|t)(T|t)(P|p)(S|s)*://",
  892. case re:run(String, HTTP_regex) of
  893. nomatch -> not_url;
  894. {match, _} -> get_url1(String, [])
  895. end.
  896. get_url1([], Acc) -> URL = flatten(reverse(Acc)),
  897. {{url, URL}, []};
  898. % allow escaped kets
  899. get_url1([$\\, $> | T], Acc) -> get_url1(T, [$>, $\\ | Acc]);
  900. get_url1([$> | T], Acc) -> URL = flatten(reverse(Acc)),
  901. {{url, URL}, T};
  902. get_url1([H | T], Acc) -> get_url1(T, [H | Acc]).
  903. get_email_addie(String) ->
  904. Snip_regex = ">",
  905. case re:run(String, Snip_regex) of
  906. nomatch -> not_email;
  907. {match, [{N, _} | _T]} ->
  908. {Possible, [$> | T]} = lists:split(N, String),
  909. EMail_regex = "[a-z0-9!#$%&'*+/=?^_`{|}~-]+"
  910. ++ "(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
  911. ++ "@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
  912. ++ "(?:[a-zA-Z]{2}|com|org|net|gov|mil"
  913. ++ "|biz|info|mobi|name|aero|jobs|museum)",
  914. case re:run(Possible, EMail_regex) of
  915. nomatch -> not_email;
  916. {match, _} -> {{email, Possible}, T}
  917. end
  918. end.
  919. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  920. %%%
  921. %%% Internal functions
  922. %%%
  923. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  924. make_plain_str(List) -> m_plain(List, []).
  925. m_plain([], Acc) -> flatten(reverse(Acc));
  926. m_plain([{{ws, none}, none} | T], Acc) -> m_plain(T, [" " | Acc]);
  927. m_plain([{_, Str} | T], Acc) -> m_plain(T, [Str | Acc]).
  928. make_esc_str(List, Refs) -> m_esc(List, Refs, []).
  929. m_esc([], _R, A) -> flatten(reverse(A));
  930. m_esc([{tags, Tag} | T], R, A) -> m_esc(T, R, [{tags, Tag} | A]);
  931. m_esc([H | T], R, A) -> m_esc(T, R, [make_str([H], R) | A]).
  932. make_str(List, Refs) -> m_str1(List, Refs, []).
  933. m_str1([], _R, A) ->
  934. Flat = flatten(reverse(A)),
  935. htmlchars(Flat);
  936. m_str1([{{punc, bang}, B}, {{inline, open}, O} | T], R, A) ->
  937. case get_inline(T, R, [], img) of
  938. {Rest, {Url, Title, Acc}} -> Tag = [make_img_tag(Url, Acc, Title)],
  939. m_str1(Rest, R, [Tag | A]);
  940. {Rest, Tag} -> m_str1(Rest, R, [Tag, O, B | A])
  941. end;
  942. %% escape inline open's...
  943. m_str1([{{punc, bslash}, _}, {{inline, open}, O} | T], R, A) ->
  944. m_str1(T, R, [O | A]);
  945. m_str1([{{inline, open}, O} | T], R, A) ->
  946. case get_inline(T, R, [], url) of
  947. {Rest, {Url, Title, Acc}} ->
  948. Tit = case Title of
  949. [] -> [];
  950. _ -> " title=\"" ++ Title ++ "\""
  951. end,
  952. Tag = [{tags, "<a href=\"" ++ Url ++ "\""
  953. ++ Tit ++ ">"}, Acc,
  954. {tags, "</a>"} | []],
  955. m_str1(Rest, R, [Tag | A]);
  956. {Rest, Tag} ->
  957. m_str1(Rest, R, [Tag, O | A])
  958. end;
  959. m_str1([{email, Addie} | T], R, A) ->
  960. m_str1(T, R, [{tags, "\" />"}, Addie, {tags, "<a href=\"mailto:"}| A]);
  961. m_str1([{url, Url} | T], R, A) ->
  962. m_str1(T, R, [ {tags, "</a>"}, Url, {tags, "\">"}, Url,
  963. {tags, "<a href=\""} | A]);
  964. m_str1([{tags, _} = Tag | T], R, A) ->
  965. m_str1(T, R, [Tag | A]);
  966. m_str1([{{{tag, Type}, Tag}, _} | T], R, A) ->
  967. Tag2 = esc_tag(Tag),
  968. TagStr = case Type of
  969. open -> {tags, "&lt;" ++ Tag2 ++ "&gt;"};
  970. close -> {tags, "&lt;/" ++ Tag2 ++ "&gt;"};
  971. self_closing -> {tags, "&lt;" ++ Tag2 ++ " /&gt;"}
  972. end,
  973. m_str1(T, R, [TagStr | A]);
  974. m_str1([{_, Orig} | T], R, A) ->
  975. m_str1(T, R, [Orig | A]).
  976. % if the inline doesn't terminate its not an inline...
  977. get_inline([], _R, A, _) ->
  978. {[], make_plain_str(reverse(A))};
  979. % a url can contain an image inline
  980. get_inline([{{punc, bang}, _B}, {{inline, open}, _O} | T], R, A, url) ->
  981. {Rest, {Url, Title, Acc}} = get_inline(T, R, A, img),
  982. Tag = make_img_tag(Url, Acc, Title),
  983. % We double tag the tag so that it can get through the flatteners..
  984. get_inline(Rest, R, [{tags, Tag} | A], url);
  985. get_inline([{{inline, close}, _}, {bra, _} | T], _R, A, _) ->
  986. {Rest, Url, Title} = parse_inline(T),
  987. Tag = {string:strip(make_plain_str(Url)),
  988. make_plain_str(Title),
  989. make_plain_str(reverse(A))},
  990. {Rest, Tag};
  991. %% for img's but not url's you need to allow a single space between them
  992. %% to be compatible with showdown :(
  993. get_inline([{{inline, close}, _}, {{ws, sp}, _}, {bra, _} | T], _R, A, img) ->
  994. {Rest, Url, Title} = parse_inline(T),
  995. Tag = {string:strip(make_plain_str(Url)),
  996. make_plain_str(Title),
  997. make_plain_str(reverse(A))},
  998. {Rest, Tag};
  999. %% this clause detects references to images/links...
  1000. get_inline([{{inline, close}, _}, {{inline, open}, _} | T], R, A, _) ->
  1001. Text = make_plain_str(reverse(A)),
  1002. case get_id_diff(T) of
  1003. normal -> {[], make_plain_str(reverse(A))};
  1004. {[{_, Id}], Rest} ->
  1005. {Url, Title} = case lists:keyfind(Id, 1, R) of
  1006. false -> {"", ""};
  1007. {Id, {U, Tit}} -> {U, Tit}
  1008. end,
  1009. Tag = {Url, Title, Text},
  1010. {Rest, Tag};
  1011. _Other -> {[], make_plain_str(reverse(A))} % random failing id's
  1012. end;
  1013. %% so does this one - just delete the space and rethrow it
  1014. get_inline([{{inline, close}, _} = C , {{ws, _}, _},
  1015. {{inline, open}, _} = O | T], R, A, Type) ->
  1016. get_inline([C, O | T], R, A, Type);
  1017. %% this is the markdown extension clause that takes an id in square brackets without
  1018. %% any additional stuff as a valid id marker
  1019. get_inline([{{inline, close}, _} | T], R, A, _) ->
  1020. Id = make_plain_str(reverse(A)),
  1021. case lists:keyfind(Id, 1, R) of
  1022. false -> {T, flatten([Id , $]])};
  1023. {Id, {Url, Title}} -> Tag = {Url, Title, Id},
  1024. {T, Tag}
  1025. end;
  1026. get_inline([H | T], R, A, Type) ->
  1027. get_inline(T, R, [H | A], Type).
  1028. get_id_diff(List) -> g_id_diff1(List, []).
  1029. g_id_diff1([], _Acc) -> normal;
  1030. g_id_diff1([{{inline, close}, _}| T], Acc) -> {reverse(Acc), T};
  1031. g_id_diff1([H | T], Acc) -> g_id_diff1(T, [H | Acc]).
  1032. %% convert ascii into html characters
  1033. htmlencode(List) ->
  1034. htmlencode(List, []).
  1035. htmlencode([], Acc) ->
  1036. lists:flatten(lists:reverse(Acc));
  1037. htmlencode([$& | Rest], Acc) -> htmlencode(Rest, ["&amp;" | Acc]);
  1038. htmlencode([$< | Rest], Acc) -> htmlencode(Rest, ["&lt;" | Acc]);
  1039. htmlencode([$> | Rest], Acc) -> htmlencode(Rest, ["&gt;" | Acc]);
  1040. htmlencode([160 | Rest], Acc) -> htmlencode(Rest, ["&nbsp;" | Acc]);
  1041. htmlencode([Else | Rest], Acc) -> htmlencode(Rest, [Else | Acc]).
  1042. htmlchars(List) -> htmlchars1(List, []).
  1043. htmlchars1([], Acc) -> flatten(reverse(Acc));
  1044. %% tags are just wheeched out unescaped
  1045. htmlchars1([{tags, Tag} | T], Acc) -> htmlchars1(T, [Tag | Acc]);
  1046. %% line ends are pushed to a space..
  1047. htmlchars1([?CR, ?LF | T], Acc) -> htmlchars1(T, ["\n" | Acc]);
  1048. htmlchars1([?LF | T], Acc) -> htmlchars1(T, ["\n" | Acc]);
  1049. htmlchars1([?CR | T], Acc) -> htmlchars1(T, ["\r" | Acc]);
  1050. %% emphasis is a bit strange - must be preceeded by or followed by
  1051. %% white space to work and can also be escaped
  1052. %% there is a non-space filling white space represented by the atom 'none'
  1053. %% which is created in the parser (NOT IN THE LEXER!) and which triggers
  1054. %% emphasis or strong tags being turned on...
  1055. htmlchars1([$\\, $*, $*, $* | T], A) -> htmlchars1(T, [$*, $*, $* | A]);
  1056. htmlchars1([$*, $*, $* | T], A) -> {T2, NewA} = superstrong(T, $*),
  1057. htmlchars1(T2, [NewA | A]);
  1058. % repeat for strong
  1059. htmlchars1([$\\, $*, $* | T], A) -> htmlchars1(T, [$*, $* | A]);
  1060. htmlchars1([$*, $* | T], A) -> {T2, NewA} = strong(T, $*),
  1061. htmlchars1(T2, [NewA | A]);
  1062. %% likewise for strong
  1063. htmlchars1([$\\, $* | T], A) -> htmlchars1(T, [$* | A]);
  1064. htmlchars1([$* | T], A) -> {T2, NewA} = emphasis(T, $*),
  1065. htmlchars1(T2, [NewA | A]);
  1066. %% and again for underscores
  1067. htmlchars1([$\\, $_, $_, $_ | T], A) -> htmlchars1(T, [$_, $_, $_ | A]);
  1068. %% the none atom is the non-space filling whitespace
  1069. htmlchars1([$_, $_, $_ | T], A) -> {T2, NewA} = superstrong(T, $_),
  1070. htmlchars1(T2, [NewA | A]);
  1071. % and strong
  1072. %% and again for underscores
  1073. htmlchars1([$\\, $_, $_ | T], A) -> htmlchars1(T, [$_, $_ | A]);
  1074. htmlchars1([$_, $_ | T], A) -> {T2, NewA} = strong(T, $_),
  1075. htmlchars1(T2, [NewA | A]);
  1076. %% likewise for strong
  1077. htmlchars1([$\\, $_ | T], A) -> htmlchars1(T, [$_ | A]);
  1078. htmlchars1([$_ | T], A) -> {T2, NewA} = emphasis(T, $_),
  1079. htmlchars1(T2, [NewA | A]);
  1080. %% handle backtick escaping
  1081. htmlchars1([$\\, $` | T], A) -> htmlchars1(T, [$` | A]);
  1082. htmlchars1([$`, $` | T], A) -> {T2, NewA} = dblcode(T),
  1083. htmlchars1(T2, [NewA | A]);
  1084. htmlchars1([$` | T], A) -> {T2, NewA} = code(T),
  1085. htmlchars1(T2, [NewA | A]);
  1086. htmlchars1([?COPY | T], A) -> htmlchars1(T, ["&copy;" | A]);
  1087. htmlchars1([?AMP | T], A) -> htmlchars1(T, ["&amp;" | A]);
  1088. htmlchars1([$& | T], A) -> htmlchars1(T, ["&amp;" | A]);
  1089. htmlchars1([$< | T], A) -> htmlchars1(T, ["&lt;" | A]);
  1090. htmlchars1([?NBSP | T], A) -> htmlchars1(T, ["&nbsp;" | A]);
  1091. htmlchars1([?TAB | T], A) -> htmlchars1(T, [" " | A]);
  1092. htmlchars1([none | T], A) -> htmlchars1(T, A);
  1093. htmlchars1([H | T], A) -> htmlchars1(T, [H | A]).
  1094. emphasis(List, Delim) -> interpolate(List, Delim, "em", "" ,[]).
  1095. strong(List, Delim) -> interpolate2(List, Delim, "strong", "", []).
  1096. superstrong(List, Delim) -> interpolate3(List, Delim, "strong", "em", "", []).
  1097. dblcode(List) -> {T, Tag} = interpolate2(List, $`, "code", "" ,[]),
  1098. {T, "<pre>" ++ Tag ++ "</pre>"}.
  1099. code(List) -> interpolateX(List, $`, "code", "", []).
  1100. %% pain in the arse - sometimes the closing tag should be preceded by
  1101. %% a "\n" and sometimes not in showdown.js
  1102. %% interpolate is for single delimiters...
  1103. interpolateX([], Delim, _Tag, _X, Acc) ->
  1104. {[], [Delim] ++ htmlchars(reverse(Acc))};
  1105. interpolateX([Delim | T], Delim, Tag, X, Acc) ->
  1106. {T, "<" ++ Tag ++ ">" ++ htmlchars(reverse(Acc)) ++ X ++
  1107. "</" ++ Tag ++ ">"};
  1108. interpolateX([H | T], Delim, Tag, X, Acc) ->
  1109. interpolateX(T, Delim, Tag, X, [H | Acc]).
  1110. interpolate([], Delim, _Tag, _X, Acc) ->
  1111. {[], [Delim] ++ htmlchars(reverse(Acc))};
  1112. interpolate([Delim | T], Delim, Tag, X, Acc) ->
  1113. {T, "<" ++ Tag ++ ">" ++ htmlchars(reverse(Acc)) ++ X ++
  1114. "</" ++ Tag ++ ">"};
  1115. interpolate([H | T], Delim, Tag, X, Acc) ->
  1116. interpolate(T, Delim, Tag, X, [H | Acc]).
  1117. %% interpolate two is for double delimiters...
  1118. interpolate2([], Delim, _Tag, _X, Acc) ->
  1119. {[], [Delim] ++ [Delim] ++ htmlchars(reverse(Acc))};
  1120. interpolate2([Delim, Delim | T], Delim, Tag, X, Acc) ->
  1121. {T, "<" ++ Tag ++ ">" ++ htmlchars(reverse(Acc)) ++ X ++
  1122. "</" ++ Tag ++ ">"};
  1123. interpolate2([H | T], Delim, Tag, X, Acc) ->
  1124. interpolate2(T, Delim, Tag, X, [H | Acc]).
  1125. %% interpolate three is for double delimiters...
  1126. interpolate3([], D, _Tag1, Tag2, _X, Acc) ->
  1127. {[], "<" ++ Tag2 ++ ">" ++ [D] ++ "</" ++ Tag2 ++ ">"
  1128. ++ htmlchars(reverse(Acc))};
  1129. interpolate3([D, D, D | T], D, Tag1, Tag2, _X, Acc) ->
  1130. {T, "<" ++ Tag1 ++ ">" ++ "<" ++ Tag2 ++ ">"
  1131. ++ htmlchars(reverse(Acc)) ++ "</" ++ Tag2 ++ ">"
  1132. ++ "</" ++ Tag1 ++ ">"};
  1133. interpolate3([H | T], D, Tag1, Tag2, X, Acc) ->
  1134. interpolate3(T, D, Tag1, Tag2, X, [H | Acc]).
  1135. make_img_tag(Url, Acc, Title) ->
  1136. {tags, "<img src=\"" ++ Url ++ "\""
  1137. ++ " alt=\"" ++ Acc ++ "\""
  1138. ++ " title=\"" ++ Title ++ "\""
  1139. ++ " />"}.
  1140. %%%-------------------------------------------------------------------
  1141. %%%
  1142. %%% Unit Tests
  1143. %%%
  1144. %%%-------------------------------------------------------------------
  1145. % -include("markdown_tests.hrl").