/src/markdown.erl

http://github.com/lethain/erlang_markdown · Erlang · 397 lines · 299 code · 24 blank · 74 comment · 1 complexity · a40d72353bd9962a6b499c148617e202 MD5 · raw file

  1. %% Copyright (c) 2009 Will Larson <lethain@gmail.com>
  2. %% <insert MIT License here>
  3. %% @todo support for horizontal rule
  4. %% @todo support for secondary title syntax "Title\n====="
  5. %% @todo support for multi-level indentation
  6. -module(markdown).
  7. -author("Will Larson <lethain@gmail.com>").
  8. -version("0.0.2").
  9. -export([markdown/1]).
  10. -export([line_start/5, single_line/5]).
  11. -export([trim_whitespace/2, preserve_line/5, start_of_next_line/1, starts_with_number/1, remove_top_tag/3]).
  12. -export([identify_line_type/1]).
  13. -export([toggle_tag/3, exclusive_insert_tag/3]).
  14. -export([parse_link/2, parse_link_text/2, parse_link_remainder/2]).
  15. -define(DEBUG_LOGGER, fun(_X,_Y) -> ok end).
  16. %-define(DEBUG_LOGGER, fun(X,Y) -> io:format(X,Y) end).
  17. %%
  18. %% Primary Interface
  19. %%
  20. markdown(Text) when is_list(Text) ->
  21. markdown(list_to_binary(Text));
  22. markdown(Binary) when is_binary(Binary) ->
  23. line_start(Binary, [], [], [], []).
  24. %%
  25. %% Multi-line Entities
  26. %%
  27. identify_line_type(<<"">>) -> {empty_line, <<"">>};
  28. identify_line_type(<<"\n", Binary/binary>>) -> {empty_line, Binary};
  29. %identify_line_type(<<"- -", Binary/binary>>) -> {hr, Binary};
  30. %identify_line_type(<<"--", Binary/binary>>) -> {hr, Binary};
  31. identify_line_type(<<"- ", Binary/binary>>) -> {ul, Binary};
  32. identify_line_type(<<" - ", Binary/binary>>) -> {deep_ul, Binary};
  33. identify_line_type(<<"* ", Binary/binary>>) -> {ul, Binary};
  34. identify_line_type(<<" * ", Binary/binary>>) -> {deep_ul, Binary};
  35. identify_line_type(<<">> ", Binary/binary>>) -> {blockquote, Binary};
  36. identify_line_type(<<"> ", Binary/binary>>) -> {blockquote, Binary};
  37. identify_line_type(<<" ", Binary/binary>>) ->
  38. case starts_with_number(Binary) of
  39. {true, Binary2} ->
  40. {deep_ol, Binary2};
  41. false ->
  42. {pre, Binary}
  43. end;
  44. identify_line_type(<<Binary/binary>>) ->
  45. case starts_with_number(Binary) of
  46. {true, Binary2} ->
  47. {ol, Binary2};
  48. false ->
  49. {p, Binary}
  50. end.
  51. %% Manages closing multi-line entities.
  52. line_start(<<Binary/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  53. ?DEBUG_LOGGER("line_start: ~p~n",[Binary]),
  54. % calculate the expected indentation depth based on current stack
  55. IndentDepth0 = lists:foldr(fun(Elem, Depth) ->
  56. case Elem of
  57. <<"ol">> -> Depth + 1;
  58. <<"ul">> -> Depth + 1;
  59. _ -> Depth
  60. end end, 0, MultiContext),
  61. IndentDepth = erlang:max(IndentDepth0-1,0),
  62. % trim whitespace based on indent depth, 4 spaces per indentation depth
  63. % restrict trimming to avoid capturing pre blocks and signifigant whitespace
  64. % from within pre blocks
  65. {Binary2, Offset} = trim_whitespace(Binary, 4*IndentDepth),
  66. ?DEBUG_LOGGER("~p => ~p, IndentDepth = ~p, offset ~p~n", [Binary, Binary2, IndentDepth, Offset]),
  67. % close an appropriate number of ol/ul blocks when
  68. % the amount of trimmed whitespace is inadequate for
  69. % the current indentation depth
  70. CloseDepthBy = IndentDepth - trunc(Offset/4),
  71. {MultiContext2, Acc3} = lists:foldr(fun(_, {Tags, Acc0}) ->
  72. {Tags2, Acc2} = remove_top_tag([<<"li">>], Tags, Acc0),
  73. remove_top_tag([<<"ul">>, <<"ol">>], Tags2, Acc2)
  74. end, {MultiContext, Acc}, lists:seq(1,CloseDepthBy)),
  75. {Type, Binary3} = identify_line_type(Binary2),
  76. ?DEBUG_LOGGER("type (~p) and stack (~p) for ~p~n", [Type, MultiContext2, Binary]),
  77. {MultiContext3, Acc4} = case {Type, MultiContext2} of
  78. {empty_line, [<<"p">> | RestTags]} ->
  79. {RestTags, [<<"</p>">> | Acc3]};
  80. {empty_line, [<<"pre">> | RestTags]} ->
  81. {RestTags, [<<"</pre>">> | Acc3]};
  82. {empty_line, [<<"blockquote">> | RestTags]} ->
  83. {RestTags, [<<"</blockquote>">> | Acc3]};
  84. {empty_line, [<<"li">>, <<"ol">> | RestTags]} ->
  85. {RestTags, [<<"</ol>">>, <<"</li>">> | Acc3]};
  86. {empty_line, [<<"li">>, <<"ul">> | RestTags]} ->
  87. {RestTags, [<<"</ul>">>, <<"</li>">> | Acc3]};
  88. {p, []} ->
  89. {[<<"p">>], [<<"<p>">> | Acc3]};
  90. {p, [<<"p">> | RestTags]} ->
  91. {[<<"p">> | RestTags], [<<" ">> | Acc3]};
  92. {p, [<<"li">> | RestTags]} ->
  93. {[<<"li">> | RestTags], [<<" ">> | Acc3]};
  94. {p, [Tag | RestTags]} ->
  95. case lists:member(Tag, [<<"pre">>, <<"blockquote">>]) of
  96. true ->
  97. {[<<"p">> | RestTags], [<<"<p>">>,<<"</",Tag/binary,">">> | Acc3]};
  98. false ->
  99. {[Tag | RestTags], Acc3}
  100. end;
  101. {pre, []} ->
  102. {[<<"pre">>], [<<"<pre>">> | Acc3]};
  103. {pre, [<<"pre">> | RestTags]} ->
  104. {[<<"pre">> | RestTags], [<<"\n">> | Acc3]};
  105. {pre, [Tag | RestTags]} ->
  106. case lists:member(Tag, [<<"p">>, <<"blockquote">>]) of
  107. true ->
  108. {[<<"pre">> | RestTags], [<<"<pre>">>,<<"</",Tag/binary,">">> | Acc3]};
  109. false ->
  110. {[Tag | RestTags], Acc3}
  111. end;
  112. {blockquote, []} ->
  113. {[<<"blockquote">>], [<<"<blockquote>">> | Acc3]};
  114. {blockquote, [<<"blockquote">> | RestTags]} ->
  115. {[<<"blockquote">> | RestTags], [<<" ">> | Acc3]};
  116. {blockquote, [Tag | RestTags]} ->
  117. case lists:member(Tag, [<<"pre">>, <<"p">>]) of
  118. true ->
  119. {[<<"blockquote">> | RestTags], [<<"<blockquote>">>,<<"</",Tag/binary,">">> | Acc3]};
  120. false ->
  121. {[Tag | RestTags], Acc3}
  122. end;
  123. {deep_ul, [<<"li">> | RestTags]} ->
  124. {[ <<"li">>, <<"ul">>, <<"li">> | RestTags], [<<"<li>">>, <<"<ul>">> | Acc3]};
  125. {deep_ol, [<<"li">> | RestTags]} ->
  126. {[ <<"li">>, <<"ol">>, <<"li">> | RestTags], [<<"<li>">>, <<"<ol>">> | Acc3]};
  127. {ol, [<<"li">> | RestTags]} ->
  128. {[<<"li">> | RestTags], [<<"<li>">>, <<"</li>">> | Acc3]};
  129. {ol, RestTags} ->
  130. {[<<"li">>, <<"ol">> | RestTags], [<<"<li>">>, <<"<ol>">> | Acc3]};
  131. {ul, [<<"li">> | RestTags]} ->
  132. {[<<"li">> | RestTags], [<<"<li>">>, <<"</li>">> | Acc3]};
  133. {ul, RestTags} ->
  134. {[<<"li">>, <<"ul">> | RestTags], [<<"<li>">>, <<"<ul>">> | Acc3]};
  135. _ ->
  136. {MultiContext2, Acc3}
  137. end,
  138. case {Type, Binary3} of
  139. {empty_line, <<"">>} ->
  140. single_line(Binary3, OpenTags, Acc4, LinkContext, MultiContext3);
  141. {empty_line, _} ->
  142. line_start(Binary3, OpenTags, Acc4, LinkContext, MultiContext3);
  143. _ ->
  144. single_line(Binary3, OpenTags, Acc4, LinkContext, MultiContext3)
  145. end.
  146. %%
  147. %% Single Line Entities (headers, em, strong, links, code)
  148. %%
  149. %% Wrapup function, called at end of document.
  150. single_line(<<"">>, OpenTags, Acc, _LinkContext, MultiContext) ->
  151. Open = lists:reverse(lists:append([OpenTags, MultiContext])),
  152. ?DEBUG_LOGGER("remaining_tags: ~p~n", [Open]),
  153. ClosedTags = lists:foldr(fun(Tag, Acc2) ->
  154. [<<"</",Tag/binary,">">> | Acc2]
  155. end, Acc, Open),
  156. % markdown is gathered in reverse order
  157. Reversed = lists:reverse(ClosedTags),
  158. %list_to_binary(lists:append(lists:map(fun(X) -> binary_to_list(X) end, Reversed)));
  159. lists:foldr(fun(X,Acc2) -> <<Acc2/binary, X/binary>> end, <<"">>, Reversed);
  160. %% Pass control to multi-line entity handler when
  161. %% encountering new-line.
  162. single_line(<<" \n", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  163. line_start(Rest, OpenTags, [<<"<br>">> | Acc], LinkContext, MultiContext);
  164. single_line(<<"\n", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  165. line_start(Rest, OpenTags, Acc, LinkContext, MultiContext);
  166. single_line(<<"#####", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  167. {OpenTags2, Acc2} = exclusive_insert_tag(<<"h5">>, OpenTags, Acc),
  168. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  169. single_line(<<"####", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  170. {OpenTags2, Acc2} = exclusive_insert_tag(<<"h4">>, OpenTags, Acc),
  171. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  172. single_line(<<"###", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  173. {OpenTags2, Acc2} = exclusive_insert_tag(<<"h3">>, OpenTags, Acc),
  174. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  175. single_line(<<"##", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  176. {OpenTags2, Acc2} = exclusive_insert_tag(<<"h2">>, OpenTags, Acc),
  177. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  178. single_line(<<"#", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  179. {OpenTags2, Acc2} = exclusive_insert_tag(<<"h1">>, OpenTags, Acc),
  180. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  181. single_line(<<"**", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  182. {OpenTags2, Acc2} = toggle_tag(<<"strong">>, OpenTags, Acc),
  183. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  184. single_line(<<"*", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  185. {OpenTags2, Acc2} = toggle_tag(<<"em">>, OpenTags, Acc),
  186. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  187. single_line(<<"``", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  188. {OpenTags2, Acc2} = toggle_tag(<<"code">>, OpenTags, Acc),
  189. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  190. single_line(<<"`", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  191. {OpenTags2, Acc2} = toggle_tag(<<"code">>, OpenTags, Acc),
  192. single_line(Rest, OpenTags2, Acc2, LinkContext, MultiContext);
  193. single_line(<<"![", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  194. case parse_link(<<"[", Rest/binary>>, LinkContext) of
  195. {link, Rest2, Href, Text, []} ->
  196. Img = <<"<img src=\"", Href/binary, "\" alt=\"", Text/binary, "\">">>,
  197. single_line(Rest2, OpenTags, [Img | Acc], LinkContext, MultiContext);
  198. {link, Rest2, Href, Text, Title} ->
  199. Img = <<"<img src=\"", Href/binary, "\" alt=\"", Text/binary, "\" title=\"", Title/binary, "\">">>,
  200. single_line(Rest2, OpenTags, [Img | Acc], LinkContext, MultiContext)
  201. end;
  202. single_line(<<"[", Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  203. case parse_link(<<"[" , Rest/binary>>, LinkContext) of
  204. {link, Rest2, Href, Text, <<"">>} ->
  205. Link = <<"<a href=\"", Href/binary, "\">", Text/binary, "</a>">>,
  206. single_line(Rest2, OpenTags, [Link | Acc], LinkContext, MultiContext);
  207. {link, Rest2, Href, Text, Title} ->
  208. Link = <<"<a href=\"", Href/binary, "\" title=\"", Title/binary, "\">", Text/binary, "</a>">>,
  209. single_line(Rest2, OpenTags, [Link | Acc], LinkContext, MultiContext);
  210. {context, Rest2, LinkContext2} ->
  211. single_line(Rest2, OpenTags, Acc, LinkContext2, MultiContext)
  212. end;
  213. single_line(<<B:1/binary, Rest/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  214. single_line(Rest, OpenTags, [B | Acc], LinkContext, MultiContext).
  215. %%
  216. %% Utility functions (parsing links, managing tags, etc)
  217. %%
  218. %% @doc sub-parser for handling links.
  219. %% Handles formats:
  220. %% [This is a test](http://test.com/ "The title")
  221. %% [This is a test][test]
  222. %%
  223. %% For the second format, you'll need to have previously
  224. %% specified the link using the format
  225. %% [test]: http://test.com/ "Test Title"
  226. %%
  227. %% @spec parse_link() -> link_components() | new_context()
  228. %% new_context = {context, binary(), proplist()}
  229. %% link_components = {link, binary(), href(), text(), title()}
  230. %% href = string()
  231. %% text = string()
  232. %% title = string() | undefined
  233. %% proplist = [{binary(), binary()}]
  234. parse_link(Binary, LinkContext) ->
  235. {Binary2, Text} = parse_link_text(Binary,[]),
  236. case Binary2 of
  237. <<"(", Binary3/binary>> ->
  238. {Binary4, Link, Title} = parse_link_remainder(Binary3, <<")">>),
  239. {link, Binary4, Link, Text, Title};
  240. <<"[",Binary3/binary>> ->
  241. {Binary4, Reference} = parse_link_text(<<"[",Binary3/binary>>, []),
  242. case proplists:get_value(Reference, LinkContext) of
  243. {Link, Title} ->
  244. {link, Binary4, Link, Text, Title};
  245. _ ->
  246. {syntax_error, reference_to_undeclared_link_definition}
  247. end;
  248. <<":",Binary3/binary>> ->
  249. {Binary4, Link, Title} = parse_link_remainder(Binary3, <<"\n">>),
  250. {context, Binary4, [{Text, {Link, Title}} | LinkContext]};
  251. <<_:1/binary, _Binary3/binary>> ->
  252. {syntax_error, expected_paren_bracket_or_colon}
  253. end.
  254. %% @doc parse the text portion of a link.
  255. %% For example, parse "test" from [test][this].
  256. parse_link_text(<<"\n", _Binary/binary>>, _Acc) ->
  257. {syntax_error, unexpected_newline_in_link};
  258. parse_link_text(<<"[",Binary/binary>>, Acc) ->
  259. parse_link_text(Binary, Acc);
  260. parse_link_text(<<"]",Binary/binary>>, Acc) ->
  261. Reversed = lists:reverse(Acc),
  262. Text = lists:append(lists:map(fun(X) -> binary_to_list(X) end, Reversed)),
  263. {Binary, list_to_binary(Text)};
  264. parse_link_text(<<Char:1/binary, Binary/binary>>, Acc) ->
  265. parse_link_text(Binary, [Char | Acc]).
  266. %% @doc Parse the link and title out of Markdown link remainder.
  267. %% 'http:/test/ "this"' has link of "http:/test/" and title of "this"
  268. %%
  269. %% Example:
  270. %% {Binary4, Link, Title} = parse_link_remainder(Binary3, <<")">>),
  271. parse_link_remainder(<<Binary/binary>>, <<EndChar:1/binary>>) ->
  272. parse_link_remainder(Binary, EndChar, [], [], link).
  273. parse_link_remainder(<<EndChar:1/binary>>, EndChar, LinkAcc, TitleAcc, _) ->
  274. Link = list_to_binary(lists:append(lists:map(fun(X) -> binary_to_list(X) end, lists:reverse(LinkAcc)))),
  275. Title = list_to_binary(lists:append(lists:map(fun(X) -> binary_to_list(X) end, lists:reverse(TitleAcc)))),
  276. {<<"">>, Link, Title};
  277. parse_link_remainder(<<EndChar:1/binary, Binary/binary>>, EndChar, LinkAcc, TitleAcc, _) ->
  278. Link = list_to_binary(lists:append(lists:map(fun(X) -> binary_to_list(X) end, lists:reverse(LinkAcc)))),
  279. Title = list_to_binary(lists:append(lists:map(fun(X) -> binary_to_list(X) end, lists:reverse(TitleAcc)))),
  280. {Binary, Link, Title};
  281. parse_link_remainder(<<"\"", Binary/binary>>, EndChar, LinkAcc, TitleAcc, title) ->
  282. parse_link_remainder(<<Binary/binary>>, EndChar, LinkAcc, TitleAcc, done);
  283. parse_link_remainder(<<"\"", Binary/binary>>, EndChar, LinkAcc, [], link) ->
  284. parse_link_remainder(Binary, EndChar, LinkAcc, [], title);
  285. parse_link_remainder(<<" ", Binary/binary>>, EndChar, LinkAcc, [], link) ->
  286. parse_link_remainder(Binary, EndChar, LinkAcc, [], link);
  287. parse_link_remainder(<<Char:1/binary, Binary/binary>>, EndChar, LinkAcc, [], link) ->
  288. parse_link_remainder(Binary, EndChar, [Char | LinkAcc], [], link);
  289. parse_link_remainder(<<"\"", Binary/binary>>, EndChar, LinkAcc, TitleAcc, title) ->
  290. parse_link_remainder(Binary, EndChar, LinkAcc, TitleAcc, done);
  291. parse_link_remainder(<<Char:1/binary, Binary/binary>>, EndChar, LinkAcc, TitleAcc, title) ->
  292. parse_link_remainder(Binary, EndChar, LinkAcc, [Char | TitleAcc], title).
  293. %% @doc remove all whitespace from a newline, returns count
  294. %% of whitespace and trimmed binary.
  295. trim_whitespace(<<Binary/binary>>, Max) ->
  296. trim_whitespace(Binary, 0, Max).
  297. trim_whitespace(<<Binary/binary>>, Max, Max) ->
  298. {Binary, Max};
  299. trim_whitespace(<<" ", Binary/binary>>, Offset, Max) ->
  300. trim_whitespace(Binary, Offset+1, Max);
  301. trim_whitespace(<<Binary/binary>>, Offset, _Max) ->
  302. {Binary, Offset}.
  303. %% @doc close a tag if it is in the open tags stack,
  304. %% otherwise open it.
  305. %% @spec toggle_tag(tag(), tag_stack(), html()) -> {tag_stack(), html()}
  306. %% tag = binary()
  307. %% tag_stack = [tag()]
  308. %% html = [binary()]
  309. toggle_tag(Tag, OpenTags, Acc) ->
  310. case lists:member(Tag, OpenTags) of
  311. true ->
  312. {lists:delete(Tag, OpenTags), [<<"</",Tag/binary,">">> | Acc]};
  313. false ->
  314. {[Tag | OpenTags], [<<"<",Tag/binary,">">> | Acc]}
  315. end.
  316. %% @doc insert tag IFF it isn't already on the
  317. %% stack of open tags.
  318. %% @spec exclusive_insert_tag(tag(), tag_stack(), html()) -> {tag_stack(), html()}
  319. %% tag = binary()
  320. %% tag_stack = [tag()]
  321. %% html = [binary()]
  322. exclusive_insert_tag(Tag, OpenTags, Acc) ->
  323. case lists:member(Tag,OpenTags) of
  324. true ->
  325. {OpenTags, Acc};
  326. false ->
  327. {[Tag | OpenTags], [<<"<",Tag/binary,">">> | Acc]}
  328. end.
  329. %% @doc consume an entire line as is without modification
  330. preserve_line(<<"">>, OpenTags, Acc, LinkContext, MultiContext) ->
  331. line_start(<<"">>, OpenTags, Acc, LinkContext, MultiContext);
  332. preserve_line(<<"\n",Binary/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  333. line_start(Binary, OpenTags, [<<"\n">> | Acc], LinkContext, MultiContext);
  334. preserve_line(<<Char:1/binary,Binary/binary>>, OpenTags, Acc, LinkContext, MultiContext) ->
  335. preserve_line(Binary, OpenTags, [Char | Acc], LinkContext, MultiContext).
  336. %% @doc skip remainder of line
  337. start_of_next_line(<<"">>) ->
  338. <<"">>;
  339. start_of_next_line(<<"\n", Binary/binary>>) ->
  340. Binary;
  341. start_of_next_line(<<_Char:1/binary, Binary/binary>>) ->
  342. start_of_next_line(Binary).
  343. %% @doc determine if line starts with a number
  344. starts_with_number(<<Binary/binary>>) ->
  345. ?DEBUG_LOGGER("starts_with_number: ~p~n", [Binary]),
  346. starts_with_number(Binary, []).
  347. starts_with_number(<<"">>, []) ->
  348. false;
  349. starts_with_number(<<".", _Binary/binary>>, []) ->
  350. false;
  351. starts_with_number(<<". ", Binary/binary>>, _Acc) ->
  352. {true, Binary};
  353. starts_with_number(<<Char:1/binary, Binary/binary>>, Acc) ->
  354. try
  355. _Integer = list_to_integer(binary_to_list(Char)),
  356. starts_with_number(Binary, [Char | Acc])
  357. catch
  358. _:_ ->
  359. false
  360. end.
  361. %% @doc remove the first occurance of any of the tags in ToRemove.
  362. remove_top_tag(ToRemove, Tags, Html) ->
  363. {_, Tags3, Html3} = lists:foldr(fun(X, {Done, Acc, Html2}) ->
  364. case {Done, lists:member(X, ToRemove)} of
  365. {true, _} ->
  366. {true, [X | Acc], Html2};
  367. {false, true} ->
  368. {true, Acc, [<<"</",X/binary,">">> | Html2]};
  369. {false, false} ->
  370. {false, [X | Acc], Html2}
  371. end
  372. end, {false,[],Html}, Tags),
  373. {Tags3, Html3}.