/src/markdown/z_html2markdown.erl

https://code.google.com/p/zotonic/ · Erlang · 306 lines · 227 code · 55 blank · 24 comment · 3 complexity · 588293a2125a361ac3519b7018f759a2 MD5 · raw file

  1. %% @author Marc Worrell <marc@worrell.nl>
  2. %% @copyright 2011 Marc Worrell
  3. %% @doc Convert a html text to markdown syntax.
  4. %% This is used when editing TinyMCE texts with the markdown editor.
  5. %% Copyright 2011 Marc Worrell
  6. %%
  7. %% Licensed under the Apache License, Version 2.0 (the "License");
  8. %% you may not use this file except in compliance with the License.
  9. %% You may obtain a copy of the License at
  10. %%
  11. %% http://www.apache.org/licenses/LICENSE-2.0
  12. %%
  13. %% Unless required by applicable law or agreed to in writing, software
  14. %% distributed under the License is distributed on an "AS IS" BASIS,
  15. %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. %% See the License for the specific language governing permissions and
  17. %% limitations under the License.
  18. -module(z_html2markdown).
  19. -author("Marc Worrell <marc@worrell.nl>").
  20. -export([
  21. convert/1,
  22. convert/2
  23. ]).
  24. -include("zotonic.hrl").
  25. % Accumulated markdown state (tree walker)
  26. -record(md, {a=[]}).
  27. % Recursive context dependent markdown state (context)
  28. -record(ms, {li=none, indent=[], allow_html=true}).
  29. -compile({no_auto_import,[max/2]}).
  %% @doc Convert a html text to markdown format, using the default options.
  convert(Html) ->
      convert(Html, []).

  %% @doc Convert a html text (binary or iolist) to markdown format.
  %% Assumes the html has been sanitized and normalized.
  %% The text is wrapped in a single <sanitize> root element before parsing.
  %% Options: `no_html' disables the allow_html flag of the conversion state.
  convert(Html, Options) when is_binary(Html); is_list(Html) ->
      Wrapped = iolist_to_binary([<<"<sanitize>">>, Html, <<"</sanitize>">>]),
      convert1(Wrapped, Options).
  %% @doc Parse the wrapped html, walk the tree and return the markdown as a binary.
  %% Leading newlines of the body are removed; the collected link references
  %% (if any) are appended after the body.
  convert1(Html, Options) ->
      Tree = mochiweb_html:parse(Html),
      State = set_options(Options, #ms{}),
      {Text, Anchors} = to_md(Tree, #md{}, State),
      Body = trimnl(iolist_to_binary(Text)),
      iolist_to_binary([Body, expand_anchors(Anchors)]).
  %% @doc Apply the option list to the conversion state.
  %% The only known option is `no_html', which clears the allow_html flag.
  %% Any other option is not matched and makes the call fail.
  set_options(Options, S) ->
      lists:foldl(
          fun(no_html, State) -> State#ms{allow_html=false} end,
          S,
          Options).
  %% @doc Convert a parsed html node (binary text, comment, element tuple or a
  %% list of nodes) to markdown.
  %% M is the accumulated #md{} state (collected link targets), S is the #ms{}
  %% context (current list type, indentation stack, allow_html flag).
  %% Returns {Text, M1} where Text is an iolist and M1 the updated #md{} state.
  to_md(B, M, _S) when is_binary(B) ->
      {escape_html_text(B, <<>>), M};
  to_md({comment, _Text}, M, _S) ->
      {<<>>, M};
  to_md({<<"h1">>, _Args, Enclosed}, M, S) ->
      header($=, Enclosed, M, S);
  to_md({<<"h2">>, _Args, Enclosed}, M, S) ->
      header($-, Enclosed, M, S);
  to_md({<<"h",N>>, _Args, Enclosed}, M, S) when N >= $1, N =< $6 ->
      {EncText, M1} = to_md(Enclosed, M, S),
      {[nl(S), nl(S), lists:duplicate(N-$0, "#"), 32, EncText, nl(S), nl(S)], M1};
  %% hr and br are matched regardless of their attributes, so that
  %% e.g. <br class="x"> is not silently dropped by the catch-all clause.
  to_md({<<"hr">>, _Args, []}, M, S) ->
      {[nl(S), nl(S), <<"---">>, nl(S), nl(S)], M};
  to_md({<<"br">>, _Args, []}, M, S) ->
      {[32, 32, nl(S)], M};
  to_md({Tag, _Args, Enclosed}, M, S) when Tag =:= <<"em">>; Tag =:= <<"i">> ->
      {EncText, M1} = to_md(Enclosed, M, S),
      {[$*, trl(EncText), $*], M1};
  to_md({Tag, _Args, Enclosed}, M, S) when Tag =:= <<"strong">>; Tag =:= <<"b">> ->
      {EncText, M1} = to_md(Enclosed, M, S),
      {[$*, $*, trl(EncText), $*, $*], M1};
  to_md({<<"p">>, _Args, Enclosed}, M, S) ->
      {EncText, M1} = to_md(Enclosed, M, S),
      {[trl(EncText), nl(S), nl(S)], M1};
  to_md({<<"a">>, Args, Enclosed}, M, S) ->
      case proplists:get_value(<<"href">>, Args) of
          undefined ->
              %% An anchor without href is rendered as its plain content.
              to_md(Enclosed, M, S);
          Href ->
              {EncText, M1} = to_md(Enclosed, M, S),
              {M2, RefNr} = add_anchor(Href, M1),
              {[ $[, trl(EncText), $], $[, integer_to_list(RefNr), $] ], M2}
      end;
  to_md({<<"code">>, _Args, Enclosed}, M, S) ->
      {EncText, M1} = to_md(Enclosed, M, S),
      {[$`, z_string:trim(EncText), $`], M1};
  %% A <pre> holding a single <code> is unwrapped, so that the code content
  %% is not additionally surrounded by backticks.
  to_md({<<"pre">>, _Args, [{<<"code">>, _, Enclosed}]}, M, S) ->
      to_md_indented(code, Enclosed, M, S);
  to_md({<<"pre">>, _Args, Enclosed}, M, S) ->
      to_md_indented(code, Enclosed, M, S);
  %% The html element is <blockquote>; <quote> is kept for backward compatibility.
  to_md({Tag, _Args, Enclosed}, M, S) when Tag =:= <<"quote">>; Tag =:= <<"blockquote">> ->
      to_md_indented(quote, Enclosed, M, S);
  to_md({<<"ul">>, _Args, Enclosed}, M, S) ->
      {EncText, M1} = to_md(Enclosed, M, S#ms{li=ul}),
      {[nl(S), trl(EncText), nl(S)], M1};
  to_md({<<"ol">>, _Args, Enclosed}, M, S) ->
      {EncText, M1} = to_md(Enclosed, M, S#ms{li=ol}),
      {[nl(S), trl(EncText), nl(S)], M1};
  to_md({<<"li">>, _Args, Enclosed}, M, S) ->
      %% A stray <li> (li=none, outside <ul>/<ol>) is handled as an
      %% unordered item instead of crashing on an unmatched list type.
      {Kind, Bullet} = case S#ms.li of
          ol   -> {ol, "1. "};
          ul   -> {ul, "* "};
          none -> {ul, "* "}
      end,
      {EncText, M1} = to_md(Enclosed, M, S#ms{li=none, indent=[Kind|S#ms.indent]}),
      {[nl(S), Bullet, 32, trl(EncText)], M1};
  to_md({<<"table">>, _Args, _Enclosed} = Html, M, S) when S#ms.allow_html ->
      %% Tables are passed through as html when html is allowed.
      {flatten_html(Html), M};
  to_md({<<"head">>, _Args, _Enclosed}, M, _S) ->
      {[], M};
  to_md({<<"script">>, _Args, _Enclosed}, M, _S) ->
      {[], M};
  to_md({_, _, Enclosed}, M, S) ->
      %% Unknown element: only its content is converted.
      to_md(Enclosed, M, S);
  to_md(L, M, S) when is_list(L) ->
      %% Convert every node in order while threading the #md{} state through.
      lists:mapfoldl(fun(Elt, AM) -> to_md(Elt, AM, S) end, M, L).

  %% Convert Enclosed with an extra indentation level (code or quote) pushed
  %% on the indent stack; the block is closed with a newline of the outer level.
  to_md_indented(Kind, Enclosed, M, S) ->
      S1 = S#ms{indent=[Kind|S#ms.indent]},
      {EncText, M1} = to_md(Enclosed, M, S1),
      {[nl(S1), trl(EncText), nl(S)], M1}.
  %% @doc Render an underlined (setext style) header.
  %% Char is the underline character; the underline is as long as the header
  %% text (as counted by len/1), with a minimum of three characters.
  %% A header without text renders as nothing.
  header(Char, Enclosed, M, S) ->
      {EncText, M1} = to_md(Enclosed, M, S),
      case trl(EncText) of
          Empty when Empty =:= []; Empty =:= <<>> ->
              {[], M1};
          Title ->
              Width = max(len(Title), 3),
              Rule = lists:duplicate(Width, [Char]),
              {[nl(S), nl(S), Title, nl(S), Rule, nl(S), nl(S)], M1}
      end.
  %% @doc Return the larger of the two terms; B is returned when they compare equal.
  max(A, B) ->
      case A > B of
          true -> A;
          false -> B
      end.
  %% @doc Return a newline for the current context: a bare newline character
  %% when there is no indentation, otherwise a newline followed by the
  %% prefixes of all indentation levels (see nl1/2).
  nl(#ms{indent=Indent}) ->
      case Indent of
          [] -> $\n;
          _ -> nl1(Indent, [])
      end.
  %% @doc Build a newline followed by the line prefix for the indent stack.
  %% The stack has the innermost level first; prefixes are prepended to Acc
  %% so that the result lists the outermost level first.
  %% List and code levels indent with four spaces, as Markdown requires for
  %% code blocks and nested list content; quote levels are prefixed with "> ".
  nl1([], Acc) ->
      [$\n|Acc];
  nl1([ul|Rest], Acc) ->
      nl1(Rest, ["    "|Acc]);
  nl1([ol|Rest], Acc) ->
      nl1(Rest, ["    "|Acc]);
  nl1([code|Rest], Acc) ->
      nl1(Rest, ["    "|Acc]);
  nl1([quote|Rest], Acc) ->
      nl1(Rest, ["> "|Acc]).
  %% @doc Length of an iolist: binaries count their bytes, integers count as
  %% one, nested lists are summed recursively.
  len([]) ->
      0;
  len([Head|Tail]) ->
      len(Head) + len(Tail);
  len(Char) when is_integer(Char) ->
      1;
  len(Bin) when is_binary(Bin) ->
      byte_size(Bin).
  %% @doc Escape a text node for use in markdown (ampersand is already removed
  %% or escaped).
  %% - the markdown characters { } [ ] _ * are prefixed with a backslash,
  %%   a backtick is doubled;
  %% - pointy brackets and single/double quotes become html entities;
  %% - a space, tab or newline is replaced by a single space, after which the
  %%   remaining text is left-trimmed with trl/1.
  %% The escaped text is appended to Out.
  escape_html_text(<<>>, Out) ->
      Out;
  escape_html_text(<<${, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\{">>);
  escape_html_text(<<$}, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\}">>);
  escape_html_text(<<$[, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\[">>);
  escape_html_text(<<$], Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\]">>);
  escape_html_text(<<$_, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\_">>);
  escape_html_text(<<$*, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "\\*">>);
  escape_html_text(<<$`, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "``">>);
  escape_html_text(<<$<, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "&lt;">>);
  escape_html_text(<<$>, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "&gt;">>);
  escape_html_text(<<$", Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "&quot;">>);
  escape_html_text(<<$', Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, "&#39;">>);
  escape_html_text(<<Ws, Rest/binary>>, Out) when Ws =:= 32; Ws =:= 9; Ws =:= $\n ->
      escape_html_text(trl(Rest), <<Out/binary, 32>>);
  escape_html_text(<<Ch, Rest/binary>>, Out) ->
      escape_html_text(Rest, <<Out/binary, Ch>>).
  %% @doc Escape pointy brackets (for use in comments): '<' and '>' are
  %% replaced by their html entities, all other bytes are copied unchanged.
  %% The escaped text is appended to Out.
  escape_html_comment(<<>>, Out) ->
      Out;
  escape_html_comment(<<Ch, Rest/binary>>, Out) ->
      Piece = case Ch of
          $< -> <<"&lt;">>;
          $> -> <<"&gt;">>;
          _ -> <<Ch>>
      end,
      escape_html_comment(Rest, <<Out/binary, Piece/binary>>).
  %% @doc Remove all leading newline characters from a binary.
  %% Any other term is returned unchanged.
  trimnl(<<"\n", Tail/binary>>) ->
      trimnl(Tail);
  trimnl(Other) ->
      Other.
  %% @doc Trim the text on the left; shorthand for z_string:trim_left/1.
  trl(Text) ->
      z_string:trim_left(Text).
  %% @doc Register a link target and return {M1, RefNr}.
  %% When the Href is already known its existing (1-based) reference number is
  %% returned and the state is unchanged; otherwise the Href is appended and
  %% gets the next number.
  add_anchor(Href, M) ->
      Anchors = M#md.a,
      case indexof(Href, Anchors, 1) of
          undefined ->
              {M#md{a=Anchors ++ [Href]}, length(Anchors)+1};
          RefNr ->
              {M, RefNr}
      end.
  %% @doc Position of the first element that exactly matches Needle, counting
  %% from Pos for the head of the list; `undefined' when there is no match.
  indexof(_Needle, [], _Pos) ->
      undefined;
  indexof(Needle, [Head|Tail], Pos) ->
      case Head of
          Needle -> Pos;
          _ -> indexof(Needle, Tail, Pos+1)
      end.
  %% @doc Render the collected link targets as a reference list, preceded by a
  %% newline. Returns an empty list when no links were collected.
  expand_anchors(#md{a = Anchors}) ->
      case Anchors of
          [] -> [];
          _ -> [10 | expand_anchor(Anchors, 1, [])]
      end.
  %% @doc Render every link target as a "  [N]: Href" line, numbering from N.
  %% Acc holds already rendered lines in reverse order; they are placed
  %% before the newly rendered lines.
  expand_anchor(As, N, Acc) ->
      Numbers = lists:seq(N, N + length(As) - 1),
      Links = [ [ 32, 32, $[, integer_to_list(Nr), $], $:, 32, Href, 10 ]
                || {Nr, Href} <- lists:zip(Numbers, As) ],
      lists:reverse(Acc, Links).
  %% @doc Serialize a parsed html node back to html text (an iolist).
  %% Text is escaped with z_html:escape/1, comments are dropped, and an
  %% element without content is written as a self closing tag when
  %% is_self_closing/1 says so for its tag.
  flatten_html(Text) when is_binary(Text) ->
      z_html:escape(Text);
  flatten_html({comment, _Text}) ->
      [];
  flatten_html({Tag, Args, Enclosed}) ->
      Attrs = flatten_args(Args),
      SelfClosing = (Enclosed == []) andalso is_self_closing(Tag),
      case SelfClosing of
          true ->
              [ $<, Tag, Attrs, $/, $> ];
          false ->
              Children = lists:map(fun flatten_html/1, Enclosed),
              [ $<, Tag, Attrs, $>, Children, $<, $/, Tag, $> ]
      end.
  %% @doc True for the tags that are written in self closing form
  %% (img, br and hr), false for anything else.
  is_self_closing(Tag) ->
      lists:member(Tag, [<<"img">>, <<"br">>, <<"hr">>]).
  %% @doc Serialize a list of {Name, Value} attributes, in the given order.
  flatten_args(Args) ->
      lists:map(fun flatten_arg/1, Args).
  %% @doc Serialize a single attribute as ` name="value"' (with a leading
  %% space); the value is escaped with z_html:escape/1.
  flatten_arg({Name, Value}) ->
      Quoted = [ $", z_html:escape(Value), $" ],
      [ 32, Name, $= | Quoted ].