PageRenderTime 63ms CodeModel.GetById 2ms app.highlight 57ms RepoModel.GetById 1ms app.codeStats 0ms

/src/markdown/z_html2markdown.erl

https://code.google.com/p/zotonic/
Erlang | 306 lines | 227 code | 55 blank | 24 comment | 3 complexity | 588293a2125a361ac3519b7018f759a2 MD5 | raw file
  1%% @author Marc Worrell <marc@worrell.nl>
  2%% @copyright 2011 Marc Worrell
  3
  4%% @doc Convert a html text to markdown syntax.
  5%%      This is used when editing TinyMCE texts with the markdown editor.
  6
  7%% Copyright 2011 Marc Worrell
  8%%
  9%% Licensed under the Apache License, Version 2.0 (the "License");
 10%% you may not use this file except in compliance with the License.
 11%% You may obtain a copy of the License at
 12%% 
 13%%     http://www.apache.org/licenses/LICENSE-2.0
 14%% 
 15%% Unless required by applicable law or agreed to in writing, software
 16%% distributed under the License is distributed on an "AS IS" BASIS,
 17%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18%% See the License for the specific language governing permissions and
 19%% limitations under the License.
 20
 21
 22-module(z_html2markdown).
 23-author("Marc Worrell <marc@worrell.nl>").
 24
 25-export([
 26    convert/1,
 27    convert/2
 28]).
 29
 30-include("zotonic.hrl").
 31
 32% Accumulated markdown state (tree walker)
 33-record(md, {a=[]}).
 34
 35% Recursive context dependent markdown state (context)
 36-record(ms, {li=none, indent=[], allow_html=true}).
 37
 38-compile({no_auto_import,[max/2]}).
 39
 40convert(Html) ->
 41    convert(Html, []).
 42
 43%% @doc Convert a html text to markdown format. Assumes the html has been sanitized and normalized.
 44convert(Html, Options) when is_binary(Html) ->
 45    convert1(<<"<sanitize>",Html/binary,"</sanitize>">>, Options);
 46convert(Html, Options) when is_list(Html) ->
 47    convert1(iolist_to_binary(["<sanitize>", Html, "</sanitize>"]), Options).
 48
 49convert1(Html, Options) ->
 50    Parsed = mochiweb_html:parse(Html),
 51    {Text, M} = to_md(Parsed, #md{}, set_options(Options, #ms{})),
 52    list_to_binary([trimnl(iolist_to_binary(Text)), expand_anchors(M)]).
 53
 54    set_options([], S) ->
 55        S;
 56    set_options([no_html|T], S) ->
 57        set_options(T, S#ms{allow_html=false}).
 58
 59
 60to_md(B, M, _S) when is_binary(B) ->
 61    {escape_html_text(B, <<>>), M};
 62to_md({comment, _Text}, M, _S) ->
 63    {<<>>, M};
 64
 65to_md({<<"h1">>, _Args, Enclosed}, M, S) ->
 66    header($=, Enclosed, M, S);
 67to_md({<<"h2">>, _Args, Enclosed}, M, S) ->
 68    header($-, Enclosed, M, S);
 69to_md({<<"h",N>>, _Args, Enclosed}, M, S) when N >= $1, N =< $6 ->
 70    {EncText, M1} = to_md(Enclosed, M, S),
 71    {[nl(S), nl(S), lists:duplicate(N-$0, "#"), 32, EncText, nl(S), nl(S)], M1};
 72
 73to_md({<<"hr">>, [], []}, M, S) ->
 74    {[nl(S), nl(S), <<"---">>, nl(S), nl(S)], M};
 75
 76to_md({<<"br">>, [], []}, M, S) ->
 77    {[32, 32, nl(S)], M};
 78to_md({<<"em">>, _Args, Enclosed}, M, S) ->
 79    {EncText, M1} = to_md(Enclosed, M, S),
 80    {[$*, trl(EncText), $*], M1};
 81to_md({<<"i">>, _Args, Enclosed}, M, S) ->
 82    {EncText, M1} = to_md(Enclosed, M, S),
 83    {[$*, trl(EncText), $*], M1};
 84to_md({<<"strong">>, _Args, Enclosed}, M, S) ->
 85    {EncText, M1} = to_md(Enclosed, M, S),
 86    {[$*, $*, trl(EncText), $*, $*], M1};
 87to_md({<<"b">>, _Args, Enclosed}, M, S) ->
 88    {EncText, M1} = to_md(Enclosed, M, S),
 89    {[$*, $*, trl(EncText), $*, $*], M1};
 90
 91to_md({<<"p">>, _Args, Enclosed}, M, S) ->
 92    {EncText, M1} = to_md(Enclosed, M, S),
 93    {[trl(EncText), nl(S), nl(S)], M1};
 94
 95to_md({<<"a">>, Args, Enclosed}, M, S) ->
 96    case proplists:get_value(<<"href">>, Args) of
 97        undefined ->
 98            to_md(Enclosed, M, S);
 99        Href ->
100            {EncText, M1} = to_md(Enclosed, M, S),
101            {M2,RefNr} = add_anchor(Href, M1),
102            {[ $[, trl(EncText), $],$[,integer_to_list(RefNr),$] ], M2}
103    end;
104    
105to_md({<<"code">>, _Args, Enclosed}, M, S) ->
106    {EncText, M1} = to_md(Enclosed, M, S),
107    {[$`, z_string:trim(EncText), $`], M1};
108to_md({<<"pre">>, _Args, [{<<"code">>, _, Enclosed}]}, M, S) ->
109    S1 = S#ms{indent=[code|S#ms.indent]},
110    {EncText, M1} = to_md(Enclosed, M, S1),
111    {[nl(S1), trl(EncText), nl(S)], M1};
112to_md({<<"pre">>, _Args, Enclosed}, M, S) ->
113    S1 = S#ms{indent=[code|S#ms.indent]},
114    {EncText, M1} = to_md(Enclosed, M, S1),
115    {[nl(S1), trl(EncText), nl(S)], M1};
116
117to_md({<<"quote">>, _Args, Enclosed}, M, S) ->
118    S1 = S#ms{indent=[quote|S#ms.indent]},
119    {EncText, M1} = to_md(Enclosed, M, S1),
120    {[nl(S1), trl(EncText), nl(S)], M1};
121
122to_md({<<"ul">>, _Args, Enclosed}, M, S) ->
123    {EncText, M1} = to_md(Enclosed, M, S#ms{li=ul}),
124    {[nl(S), trl(EncText), nl(S)], M1};
125to_md({<<"ol">>, _Args, Enclosed}, M, S) ->
126    {EncText, M1} = to_md(Enclosed, M, S#ms{li=ol}),
127    {[nl(S), trl(EncText), nl(S)], M1};
128to_md({<<"li">>, _Args, Enclosed}, M, S) ->
129    Bullet = case S#ms.li of
130                ol -> "1.  ";
131                ul -> "*   "
132             end,
133    {EncText, M1} = to_md(Enclosed, M, S#ms{li=none, indent=[S#ms.li|S#ms.indent]}),
134    {[nl(S), Bullet, 32, trl(EncText)], M1};
135
136to_md({<<"table">>, _Args, _Enclosed} = Html, M, S) when S#ms.allow_html ->
137    {flatten_html(Html), M};
138
139to_md({<<"head">>, _Args, _Enclosed}, M, _S) ->
140    {[], M};
141to_md({<<"script">>, _Args, _Enclosed}, M, _S) ->
142    {[], M};
143
144to_md({_, _, Enclosed}, M, S) ->
145    to_md(Enclosed, M, S);
146to_md(L, M, S) when is_list(L) ->
147    lists:foldl(fun(Elt,{AT,AM}) -> 
148                    {AT1,AM1} = to_md(Elt, AM, S),
149                    {AT++[AT1], AM1}
150                end, {[], M}, L).
151
152
153header(Char, Enclosed, M, S) ->
154    {EncText, M1} = to_md(Enclosed, M, S),
155    Trimmed = trl(EncText),
156    case trl(EncText) of
157        [] ->
158            {[], M1};
159        <<>> ->
160            {[], M1};
161        Trimmed ->
162            {[nl(S), nl(S), Trimmed, nl(S), lists:duplicate(max(len(Trimmed), 3), [Char]), nl(S), nl(S)], M1}
163    end.
164
165
166max(A,B) when A > B -> A;
167max(_A,B) -> B.
168
169
170nl(#ms{indent=[]}) ->
171    $\n;
172nl(#ms{indent=Indent}) ->
173    nl1(Indent, []).
174
175    nl1([], Acc) -> 
176        [$\n|Acc];
177    nl1([ul|Rest], Acc) -> 
178        nl1(Rest, ["   "|Acc]);
179    nl1([ol|Rest], Acc) -> 
180        nl1(Rest, ["    "|Acc]);
181    nl1([code|Rest], Acc) -> 
182        nl1(Rest, ["    "|Acc]);
183    nl1([quote|Rest], Acc) -> 
184        nl1(Rest, ["> "|Acc]).
185
186
187%% @doc Simple recursive length of an iolist
188len(EncText) when is_binary(EncText) ->
189    size(EncText);
190len(N) when is_integer(N) ->
191    1;
192len([H|L]) ->
193    len(H) + len(L);
194len([]) ->
195    0.
196
197
198
199%% @doc Escape pointy brackets, single and double quotes in texts (ampersand is already removed or escaped).
200escape_html_text(<<>>, Acc) -> 
201    Acc;
202escape_html_text(<<${, T/binary>>, Acc) ->
203    escape_html_text(T, <<Acc/binary, "\\{">>);
204escape_html_text(<<$}, T/binary>>, Acc) ->
205    escape_html_text(T, <<Acc/binary, "\\}">>);
206escape_html_text(<<$[, T/binary>>, Acc) ->
207    escape_html_text(T, <<Acc/binary, "\\[">>);
208escape_html_text(<<$], T/binary>>, Acc) ->
209    escape_html_text(T, <<Acc/binary, "\\]">>);
210escape_html_text(<<$_, T/binary>>, Acc) ->
211    escape_html_text(T, <<Acc/binary, "\\_">>);
212escape_html_text(<<$*, T/binary>>, Acc) ->
213    escape_html_text(T, <<Acc/binary, "\\*">>);
214escape_html_text(<<$`, T/binary>>, Acc) ->
215    escape_html_text(T, <<Acc/binary, "``">>);
216escape_html_text(<<$<, T/binary>>, Acc) ->
217    escape_html_text(T, <<Acc/binary, "&lt;">>);
218escape_html_text(<<$>, T/binary>>, Acc) ->
219    escape_html_text(T, <<Acc/binary, "&gt;">>);
220escape_html_text(<<$", T/binary>>, Acc) ->
221    escape_html_text(T, <<Acc/binary, "&quot;">>);
222escape_html_text(<<$', T/binary>>, Acc) ->
223    escape_html_text(T, <<Acc/binary, "&#39;">>);
224escape_html_text(<<32, T/binary>>, Acc) ->
225    escape_html_text(trl(T), <<Acc/binary, 32>>);
226escape_html_text(<<9, T/binary>>, Acc) ->
227    escape_html_text(trl(T), <<Acc/binary, 32>>);
228escape_html_text(<<$\n, T/binary>>, Acc) ->
229    escape_html_text(trl(T), <<Acc/binary, 32>>);
230escape_html_text(<<C, T/binary>>, Acc) ->
231    escape_html_text(T, <<Acc/binary, C>>).
232
233%% @doc Escape pointy brackets (for in comments)
234escape_html_comment(<<>>, Acc) -> 
235    Acc;
236escape_html_comment(<<$<, T/binary>>, Acc) ->
237    escape_html_comment(T, <<Acc/binary, "&lt;">>);
238escape_html_comment(<<$>, T/binary>>, Acc) ->
239    escape_html_comment(T, <<Acc/binary, "&gt;">>);
240escape_html_comment(<<C, T/binary>>, Acc) ->
241    escape_html_comment(T, <<Acc/binary, C>>).
242
243
244trimnl(<<$\n, Rest/binary>>) ->
245    trimnl(Rest);
246trimnl(B) ->
247    B.
248
249trl(B) ->
250    z_string:trim_left(B).
251
252
253
254% @todo: check if the Href is already defined, if so return existing index
255add_anchor(Href, M) ->
256    case indexof(Href, M#md.a, 1) of
257        undefined ->
258            {M#md{a=M#md.a ++ [Href]}, length(M#md.a)+1};
259        N ->
260            {M, N}
261    end.
262    
263    indexof(_A, [], _N) -> undefined;
264    indexof(A, [A|_], N) -> N;
265    indexof(A, [_|R], N) -> indexof(A, R, N+1).
266
267
268expand_anchors(#md{a = []}) ->
269    [];
270expand_anchors(#md{a = As}) ->
271    [10 | expand_anchor(As, 1, []) ].
272    
273    expand_anchor([], _, Acc) ->
274        lists:reverse(Acc);
275    expand_anchor([A|As], N, Acc) ->
276        Link = [ 32, 32, $[, integer_to_list(N), $], $:, 32, A, 10 ],
277        expand_anchor(As, N+1, [Link|Acc]).
278
279        
280
281flatten_html(Text) when is_binary(Text) ->
282    z_html:escape(Text);
283flatten_html({comment, _Text}) ->
284    [];
285flatten_html({Tag, Args, Enclosed}) ->
286    case Enclosed == [] andalso is_self_closing(Tag) of
287        true ->
288            [ $<, Tag, flatten_args(Args), $/, $> ];
289        false ->
290            [
291                $<, Tag, flatten_args(Args), $>,
292                [ flatten_html(Enc) || Enc <- Enclosed ],
293                $<, $/, Tag, $>
294            ]
295    end.
296    
297    is_self_closing(<<"img">>) -> true;
298    is_self_closing(<<"br">>) -> true;
299    is_self_closing(<<"hr">>) -> true;
300    is_self_closing(_) -> false.
301
302    flatten_args(Args) ->
303        [ flatten_arg(Arg) || Arg <- Args ].
304    
305    flatten_arg({Name, Value}) ->
306        [ 32, Name, $=, $", z_html:escape(Value), $" ].