PageRenderTime 78ms CodeModel.GetById 13ms app.highlight 58ms RepoModel.GetById 1ms app.codeStats 0ms

/src/support/z_html.erl

http://github.com/zotonic/zotonic
Erlang | 629 lines | 512 code | 61 blank | 56 comment | 13 complexity | 7372422d66bc019f560385c0932d8a73 MD5 | raw file
  1%% @author Marc Worrell <marc@worrell.nl>
  2%% @copyright 2009-2012 Marc Worrell
  3%% Date: 2009-04-17
  4%%
  5%% @doc Utility functions for html processing.  Also used for property filtering (by m_rsc_update).
  6
  7%% Copyright 2009-2012 Marc Worrell
  8%%
  9%% Licensed under the Apache License, Version 2.0 (the "License");
 10%% you may not use this file except in compliance with the License.
 11%% You may obtain a copy of the License at
 12%% 
 13%%     http://www.apache.org/licenses/LICENSE-2.0
 14%% 
 15%% Unless required by applicable law or agreed to in writing, software
 16%% distributed under the License is distributed on an "AS IS" BASIS,
 17%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18%% See the License for the specific language governing permissions and
 19%% limitations under the License.
 20
 21-module(z_html).
 22-author("Marc Worrell <marc@worrell.nl").
 23
 24%% interface functions
 25-export([
 26    escape_props/1,
 27    escape_props/2,
 28    escape/1,
 29    unescape/1,
 30    strip/1,
 31    sanitize/1,
 32    sanitize/2,
 33    noscript/1,
 34    escape_link/1,
 35    nl2br/1,
 36    scrape_link_elements/1,
 37    ensure_escaped_amp/1
 38]).
 39
 40-include_lib("zotonic.hrl").
 41
 42
 43%% @doc Escape all properties used for an update statement. Only leaves the body property intact.
 44%% @spec escape_props(PropertyList) -> PropertyList
 45escape_props(Props) ->
 46    escape_props1(Props, [], undefined).
 47
 48%% @spec escape_props(PropertyList, context()) -> PropertyList
 49escape_props(Props, Context) ->
 50    escape_props1(Props, [], Context).
 51
 52    escape_props1([], Acc, _OptContext) ->
 53        Acc;
 54    escape_props1([{_K,V} = Prop|T], Acc, OptContext) when is_float(V); is_integer(V); is_atom(V) -> 
 55        escape_props1(T, [Prop|Acc], OptContext);
 56    escape_props1([{K, V}|T], Acc, OptContext) when K =:= body orelse K =:= body_extra->
 57        escape_props1(T, [{K, sanitize(V, OptContext)} | Acc], OptContext);
 58    escape_props1([{K, V}|T], Acc, OptContext) ->
 59        EscapeFun = case lists:reverse(z_convert:to_list(K)) of
 60                        "lmth_" ++ _ -> fun(A) -> sanitize(A, OptContext) end; %% prop ends in '_html'
 61                        _ -> fun escape_value/1
 62                    end,
 63        escape_props1(T, [{K, EscapeFun(V)} | Acc], OptContext).
 64
 65    escape_value({trans, Texts}) ->
 66        {trans, escape_props(Texts)};
 67    escape_value(V) when is_list(V) ->
 68        try
 69            escape_value(iolist_to_binary(V))
 70        catch _:_ ->
 71            V
 72        end;
 73    escape_value(B) when is_binary(B) ->
 74        escape(B);
 75    escape_value(V) -> 
 76        V.
 77
 78
 79%% @doc Escape a string so that it is valid within HTML/ XML.
 80%% @spec escape(iolist()) -> binary()
 81escape({trans, Tr}) ->
 82    {trans, [{Lang, escape(V)} || {Lang,V} <- Tr]};
 83escape(undefined) -> 
 84    undefined;
 85escape(<<>>) -> 
 86    <<>>;
 87escape([]) ->
 88    <<>>;
 89escape(L) when is_list(L) ->
 90    escape(list_to_binary(L));
 91escape(B) when is_binary(B) ->
 92    escape(B, <<>>).
 93
 94    escape(<<>>, Acc) -> 
 95        Acc;
 96    escape(<<"&euro;", T/binary>>, Acc) ->
 97        escape(T, <<Acc/binary, "â&#x201A;?">>);
 98    escape(<<$&, T/binary>>, Acc) ->
 99        escape(T, <<Acc/binary, "&amp;">>);
100    escape(<<$<, T/binary>>, Acc) ->
101        escape(T, <<Acc/binary, "&lt;">>);
102    escape(<<$>, T/binary>>, Acc) ->
103        escape(T, <<Acc/binary, "&gt;">>);
104    escape(<<$", T/binary>>, Acc) ->
105        escape(T, <<Acc/binary, "&quot;">>);
106    escape(<<$', T/binary>>, Acc) ->
107        escape(T, <<Acc/binary, "&#39;">>);
108    escape(<<C, T/binary>>, Acc) ->
109        escape(T, <<Acc/binary, C>>).
110
111
112%% @doc Unescape - reverses the effect of escape.
113%% @spec unescape(iolist()) -> binary()
114unescape({trans, Tr}) ->
115    {trans, [{Lang, unescape(V)} || {Lang,V} <- Tr]};
116unescape(undefined) -> 
117    undefined;
118unescape(<<>>) -> 
119    <<>>;
120unescape([]) ->
121    <<>>;
122unescape(L) when is_list(L) ->
123    unescape(list_to_binary(L));
124unescape(B) when is_binary(B) ->
125    unescape(B, <<>>).
126
127    unescape(<<>>, Acc) -> 
128        Acc;
129    unescape(<<"&amp;", T/binary>>, Acc) ->
130        unescape(T, <<Acc/binary, "&">>);
131    unescape(<<"&quot;", T/binary>>, Acc) ->
132        unescape(T, <<Acc/binary, "\"">>);
133    unescape(<<"&#39;", T/binary>>, Acc) ->
134        unescape(T, <<Acc/binary, "'">>);
135    unescape(<<"&lt;", T/binary>>, Acc) ->
136        unescape(T, <<Acc/binary, "<">>);
137    unescape(<<"&gt;", T/binary>>, Acc) ->
138        unescape(T, <<Acc/binary, ">">>);
139    unescape(<<"&euro;", T/binary>>, Acc) ->
140        unescape(T, <<Acc/binary, "â&#x201A;?">>);
141    unescape(<<C, T/binary>>, Acc) ->
142        unescape(T, <<Acc/binary, C>>).
143
144
145%% @doc Escape a text. Expands any urls to links with a nofollow attribute.
146%% @spec escape_link(Text) -> binary()
147escape_link(undefined) ->
148    undefined;
149escape_link(<<>>) ->
150    <<>>;
151escape_link([]) ->
152    <<>>;
153escape_link(Text) ->
154    case re:run(Text, "\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/)))", [{capture, first, index}, global]) of
155        {match, Matches} ->
156            Matches1 = [ hd(M) || M <- Matches ],
157            nl2br(iolist_to_binary(make_links1(0, Matches1, z_convert:to_list(Text), [])));
158        nomatch ->
159            nl2br(escape(Text))
160    end.
161
162    make_links1(_Offset, [], Text, Acc) ->
163        lists:reverse([escape(Text) | Acc]);
164    make_links1(Offset, [{Offset, Len}|Rest], Text, Acc) ->
165        {Link, Text1} = lists:split(Len, Text),
166        NoScript = noscript(Link),
167        Link1 = escape(NoScript),
168        Link2 = escape(ensure_protocol(NoScript)),
169        make_links1(Offset+Len, Rest, Text1, [["<a href=\"",Link2,"\" rel=\"nofollow\">",Link1,"</a>"] | Acc]);
170    make_links1(Offset, [{MatchOffs,_}|_] = Matches, Text, Acc) ->
171        {Text1,Text2} = lists:split(MatchOffs-Offset, Text),
172        make_links1(MatchOffs, Matches, Text2, [escape(Text1)|Acc]).
173
174    ensure_protocol([]) ->
175        [];
176    ensure_protocol("#" ++ _ = Link) ->
177        Link;
178    ensure_protocol("www" ++ Rest) ->
179        ["http://www", Rest];
180    ensure_protocol(Link) ->
181        Link.
182
183
184
185%% @doc Strip all html elements from the text. Simple parsing is applied to find the elements. Does not escape the end result.
186%% @spec strip(iolist()) -> iolist()
187strip({trans, Tr}) ->
188    {trans, [{Lang, strip(V)} || {Lang,V} <- Tr]};
189strip(undefined) ->
190    [];
191strip(<<>>) ->
192    <<>>;
193strip([]) ->
194    [];
195strip(Html) when is_binary(Html) ->
196    strip(Html, in_text, <<>>);
197strip(L) when is_list(L) ->
198    strip(list_to_binary(L)).
199
200strip(<<>>, _, Acc) -> Acc;
201strip(<<$<,T/binary>>, in_text, Acc) ->
202    strip(T, in_tag, Acc);
203strip(<<$>,T/binary>>, in_tag, Acc) ->
204    strip(T, in_text, <<Acc/binary, 32>>);
205strip(<<$>,T/binary>>, State, Acc) ->
206    strip(T, State, Acc);
207strip(<<$<,T/binary>>, State, Acc) ->
208    strip(T, State, Acc);
209strip(<<$\\,_,T/binary>>, in_dstring, Acc) ->
210    strip(T, in_dstring, Acc);
211strip(<<$\\,_,T/binary>>, in_sstring, Acc) ->
212    strip(T, in_sstring, Acc);
213strip(<<$",T/binary>>, in_tag, Acc) ->
214    strip(T, in_dstring, Acc);
215strip(<<$",T/binary>>, in_dstring, Acc) ->
216    strip(T, in_tag, Acc);
217strip(<<$',T/binary>>, in_tag, Acc) ->
218    strip(T, in_sstring, Acc);
219strip(<<$',T/binary>>, in_sstring, Acc) ->
220    strip(T, in_tag, Acc);
221strip(<<H,T/binary>>, in_text, Acc) ->
222    strip(T, in_text, <<Acc/binary, H>>);
223strip(<<_,T/binary>>, State, Acc) ->
224    strip(T, State, Acc).
225
226
227%% @doc Sanitize a (X)HTML string. Remove elements and attributes that might be harmful.
228%% @spec sanitize(binary()) -> binary()
229sanitize(Html) ->
230    sanitize(Html, undefined).
231
232sanitize({trans, Tr}, OptContext) ->
233    {trans, [{Lang, sanitize(V, OptContext)} || {Lang,V} <- Tr]};
234sanitize(Html, OptContext) when is_binary(Html) ->
235    sanitize_opts(<<"<sanitize>",Html/binary,"</sanitize>">>, OptContext);
236sanitize(Html, OptContext) when is_list(Html) ->
237    sanitize_opts(iolist_to_binary(["<sanitize>", Html, "</sanitize>"]), OptContext).
238
239    sanitize_opts(Html, OptContext) ->
240        ExtraAttrs = case OptContext of
241                        #context{} -> 
242                            binstr:split(m_config:get_value(site, html_attr_extra, <<>>, OptContext), <<",">>);
243                        undefined ->
244                            []
245                     end,
246        ExtraElts =  case OptContext of
247                        #context{} -> 
248                             binstr:split(m_config:get_value(site, html_elt_extra, <<>>, OptContext), <<",">>);
249                        undefined ->
250                             []
251                     end,
252        sanitize1(Html, ExtraElts, ExtraAttrs, OptContext).
253
254sanitize1(Html, ExtraElts, ExtraAttrs, OptContext) ->
255    Parsed = mochiweb_html:parse(ensure_escaped_amp(Html)),
256    Sanitized = sanitize(Parsed, [], ExtraElts, ExtraAttrs, OptContext),
257    flatten(Sanitized).
258
259    sanitize(B, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) when is_binary(B) ->
260        escape(B);
261    sanitize({comment, Text}, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
262        {comment, Text};
263    sanitize({pi, _Raw}, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
264        <<>>;
265    sanitize({pi, _Tag, _Attrs}, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
266        <<>>;
267    sanitize({Elt,Attrs,Enclosed}, Stack, ExtraElts, ExtraAttrs, OptContext) ->
268        Lower = list_to_binary(z_string:to_lower(Elt)),
269        case allow_elt(Lower, ExtraElts) orelse (not lists:member(Lower, Stack) andalso allow_once(Lower)) of
270            true ->
271                Attrs1 = lists:filter(fun({A,_}) -> allow_attr(A, ExtraAttrs) end, Attrs),
272                Attrs2 = [ {list_to_binary(z_string:to_lower(A)), V} || {A,V} <- Attrs1 ],
273                Stack1 = [Lower|Stack],
274                Tag = { Lower, 
275                        Attrs2,
276                        [ sanitize(Encl, Stack1, ExtraElts, ExtraAttrs, OptContext) || Encl <- Enclosed ]},
277                case OptContext of
278                    #context{} -> z_notifier:foldl(sanitize_element, Tag, OptContext);
279                    undefined -> Tag
280                end;
281            false ->
282                case skip_contents(Lower) of
283                    false ->
284                        {nop, [ sanitize(Encl, Stack, ExtraElts, ExtraAttrs, OptContext) || Encl <- Enclosed ]};
285                    true ->
286                        {nop, []}
287                end
288        end.
289
290    %% @doc Flatten the sanitized html tree to 
291    flatten(B) when is_binary(B) ->
292        escape_html_text(B, <<>>);
293    flatten({nop, Enclosed}) ->
294        flatten(Enclosed);
295    flatten({comment, Text}) ->
296        Comment = escape_html_comment(Text, <<>>),
297        <<"<!--", Comment/binary, "-->">>;
298    flatten({Elt, Attrs, Enclosed}) ->
299        EncBin = flatten(Enclosed),
300        Attrs1 = [flatten_attr(Attr) || Attr <- Attrs ],
301        Attrs2 = iolist_to_binary(z_utils:prefix(32, Attrs1)),
302        case is_selfclosing(Elt) andalso EncBin == <<>> of
303            true ->  <<$<, Elt/binary, Attrs2/binary, 32, $/, $>>>;
304            false -> <<$<, Elt/binary, Attrs2/binary, $>, EncBin/binary, $<, $/, Elt/binary, $>>>
305        end;
306    flatten(L) when is_list(L) -> 
307        iolist_to_binary([ flatten(A) || A <- L ]).
308    
309    %% @doc Flatten an attribute to a binary
310    %% @todo Filter javascript from the value (when there is a ':' then only allow http/https)
311    %% @todo Strip scripting and text css attributes
312    %% css: anything within () should be removed
313    flatten_attr({<<"style">>,Value}) ->
314        Value1 = escape(filter_css(Value), <<>>),
315        <<"style=\"", Value1/binary, $">>;
316    flatten_attr({<<"class">>,Value}) ->
317        % Remove all do_xxxx widget manager classes
318        Value1 = escape(filter_widget_class(Value)),
319        <<"class=\"", Value1/binary, $">>;
320    flatten_attr({Attr,Value}) ->
321        Value1 = case is_url_attr(Attr) of
322                    true -> noscript(Value);
323                    false -> Value
324                end,
325        Value2 = escape(Value1, <<>>),
326        <<Attr/binary, $=, $", Value2/binary, $">>.
327
328    %% @doc Escape smaller-than, greater-than, single and double quotes in texts (&amp; is already removed or escaped).
329    escape_html_text(<<>>, Acc) -> 
330        Acc;
331    escape_html_text(<<$<, T/binary>>, Acc) ->
332        escape_html_text(T, <<Acc/binary, "&lt;">>);
333    escape_html_text(<<$>, T/binary>>, Acc) ->
334        escape_html_text(T, <<Acc/binary, "&gt;">>);
335    escape_html_text(<<$", T/binary>>, Acc) ->
336        escape_html_text(T, <<Acc/binary, "&quot;">>);
337    escape_html_text(<<$', T/binary>>, Acc) ->
338        escape_html_text(T, <<Acc/binary, "&#39;">>);
339    escape_html_text(<<C, T/binary>>, Acc) ->
340        escape_html_text(T, <<Acc/binary, C>>).
341
342    %% @doc Escape smaller-than, greater-than (for in comments)
343    escape_html_comment(<<>>, Acc) -> 
344        Acc;
345    escape_html_comment(<<$<, T/binary>>, Acc) ->
346        escape_html_comment(T, <<Acc/binary, "&lt;">>);
347    escape_html_comment(<<$>, T/binary>>, Acc) ->
348        escape_html_comment(T, <<Acc/binary, "&gt;">>);
349    escape_html_comment(<<C, T/binary>>, Acc) ->
350        escape_html_comment(T, <<Acc/binary, C>>).
351
352    
353%% @doc Elements that can only occur once in a nesting.
354%% Used for cleaning up code from html editors.
355allow_once(<<"a">>) -> true;
356allow_once(<<"abbr">>) -> true;
357allow_once(<<"area">>) -> true;
358allow_once(<<"article">>) -> true;
359allow_once(<<"b">>) -> true;
360allow_once(<<"bdo">>) -> true;
361allow_once(<<"big">>) -> true;
362allow_once(<<"br">>) -> true;
363allow_once(<<"cite">>) -> true;
364allow_once(<<"del">>) -> true;
365allow_once(<<"dfn">>) -> true;
366allow_once(<<"em">>) -> true;
367allow_once(<<"hr">>) -> true;
368allow_once(<<"i">>) -> true;
369allow_once(<<"ins">>) -> true;
370allow_once(<<"nav">>) -> true;
371allow_once(<<"p">>) -> true;
372allow_once(<<"pre">>) -> true;
373allow_once(<<"q">>) -> true;
374allow_once(<<"s">>) -> true;
375allow_once(<<"small">>) -> true;
376allow_once(<<"sub">>) -> true;
377allow_once(<<"sup">>) -> true;
378allow_once(<<"strong">>) -> true;
379allow_once(<<"strike">>) -> true;
380allow_once(<<"tt">>) -> true;
381allow_once(<<"u">>) -> true;
382allow_once(<<"var">>) -> true;
383allow_once(_) -> false.
384
385%% @doc Allowed elements (see also allow_once/1 above)
386allow_elt(Elt, Extra) ->
387    allow_elt(Elt) orelse lists:member(Elt, Extra).
388
389allow_elt(<<"audio">>) -> true;
390allow_elt(<<"address">>) -> true;
391allow_elt(<<"bdo">>) -> true;
392allow_elt(<<"blockquote">>) -> true;
393allow_elt(<<"caption">>) -> true;
394allow_elt(<<"col">>) -> true;
395allow_elt(<<"colgroup">>) -> true;
396allow_elt(<<"dd">>) -> true;
397allow_elt(<<"dl">>) -> true;
398allow_elt(<<"dt">>) -> true;
399allow_elt(<<"div">>) -> true;
400allow_elt(<<"h1">>) -> true;
401allow_elt(<<"h2">>) -> true;
402allow_elt(<<"h3">>) -> true;
403allow_elt(<<"h4">>) -> true;
404allow_elt(<<"h5">>) -> true;
405allow_elt(<<"h6">>) -> true;
406allow_elt(<<"header">>) -> true;
407allow_elt(<<"img">>) -> true;
408allow_elt(<<"li">>) -> true;
409allow_elt(<<"legend">>) -> true;
410allow_elt(<<"map">>) -> true;
411allow_elt(<<"ol">>) -> true;
412allow_elt(<<"samp">>) -> true;
413allow_elt(<<"section">>) -> true;
414allow_elt(<<"source">>) -> true;
415allow_elt(<<"span">>) -> true;
416allow_elt(<<"table">>) -> true;
417allow_elt(<<"tbody">>) -> true;
418allow_elt(<<"tfoot">>) -> true;
419allow_elt(<<"thead">>) -> true;
420allow_elt(<<"td">>) -> true;
421allow_elt(<<"th">>) -> true;
422allow_elt(<<"tr">>) -> true;
423allow_elt(<<"ul">>) -> true;
424allow_elt(<<"video">>) -> true;
425allow_elt(_) -> false.
426
427%% @doc Allowed attributes
428allow_attr(Attr, Extra) ->
429    allow_attr(Attr) orelse lists:member(Attr, Extra).
430
431allow_attr(<<"align">>) -> true;
432allow_attr(<<"alt">>) -> true;
433allow_attr(<<"autoplay">>) -> true;
434allow_attr(<<"border">>) -> true;
435allow_attr(<<"borderspacing">>) -> true;
436allow_attr(<<"cellpadding">>) -> true;
437allow_attr(<<"cellspacing">>) -> true;
438allow_attr(<<"class">>) -> true;
439allow_attr(<<"colspan">>) -> true;
440allow_attr(<<"controls">>) -> true;
441allow_attr(<<"coords">>) -> true;
442allow_attr(<<"dir">>) -> true;
443allow_attr(<<"height">>) -> true;
444allow_attr(<<"href">>) -> true;
445%allow_attr(<<"id">>) -> true;
446allow_attr(<<"loop">>) -> true;
447allow_attr(<<"name">>) -> true;
448allow_attr(<<"poster">>) -> true;
449allow_attr(<<"preload">>) -> true;
450allow_attr(<<"rel">>) -> true;
451allow_attr(<<"rowspan">>) -> true;
452allow_attr(<<"shape">>) -> true;
453allow_attr(<<"src">>) -> true;
454allow_attr(<<"style">>) -> true;
455allow_attr(<<"target">>) -> true;
456allow_attr(<<"title">>) -> true;
457allow_attr(<<"usemap">>) -> true;
458allow_attr(<<"valign">>) -> true;
459allow_attr(<<"width">>) -> true;
460allow_attr(_) -> false.
461
462%% @doc Check if the attribute might contain an url
463is_url_attr(<<"src">>) -> true;
464is_url_attr(<<"href">>) -> true;
465is_url_attr(<<"poster">>) -> true;
466is_url_attr(_) -> false.
467
468%% @doc Elements that shouldn't use a open and close tag.
469is_selfclosing(<<"br">>) -> true;
470is_selfclosing(<<"hr">>) -> true;
471is_selfclosing(<<"img">>) -> true;
472is_selfclosing(_) -> false.
473
474%% @doc Disallowed elements whose contents should be skipped
475skip_contents(<<"style">>) -> true;
476skip_contents(<<"script">>) -> true;
477skip_contents(<<"deleteme">>) -> true;
478skip_contents(<<"head">>) -> true;
479skip_contents(_) -> false.
480
481%% @doc Simple filter for css. Removes parts between () and quoted strings. 
482filter_css(undefined) ->
483    [];
484filter_css(<<>>) ->
485    <<>>;
486filter_css([]) ->
487    [];
488filter_css(Html) when is_binary(Html) ->
489    filter_css(Html, in_text, <<>>);
490filter_css(L) when is_list(L) ->
491    filter_css(list_to_binary(L)).
492
493filter_css(<<>>, _, Acc) -> Acc;
494filter_css(<<$(,T/binary>>, in_text, Acc) ->
495    filter_css(T, in_paren, <<Acc/binary,$(>>);
496filter_css(<<$),T/binary>>, in_paren, Acc) ->
497    filter_css(T, in_text, <<Acc/binary,$)>>);
498filter_css(<<$),T/binary>>, State, Acc) ->
499    filter_css(T, State, Acc);
500filter_css(<<_,T/binary>>, in_paren, Acc) ->
501    filter_css(T, in_paren, Acc);
502filter_css(<<$",T/binary>>, in_text, Acc) ->
503    filter_css(T, in_dstring, <<Acc/binary,$">>);
504filter_css(<<$",T/binary>>, in_dstring, Acc) ->
505    filter_css(T, in_text, <<Acc/binary,$">>);
506filter_css(<<$',T/binary>>, in_text, Acc) ->
507    filter_css(T, in_sstring, <<Acc/binary,$'>>);
508filter_css(<<$',T/binary>>, in_sstring, Acc) ->
509    filter_css(T, in_text, <<Acc/binary,$'>>);
510filter_css(<<$\\,_,T/binary>>, in_sstring, Acc) ->
511    filter_css(T, in_sstring, Acc);
512filter_css(<<$\\,_,T/binary>>, in_dstring, Acc) ->
513    filter_css(T, in_dstring, Acc);
514filter_css(<<$\\,H,T/binary>>, in_text, Acc) ->
515    filter_css(T, in_text, <<Acc/binary,H>>);
516filter_css(<<H,T/binary>>, in_text, Acc) ->
517    filter_css(T, in_text, <<Acc/binary, H>>);
518filter_css(<<_,T/binary>>, State, Acc) ->
519    filter_css(T, State, Acc).
520
521%% @doc Remove all do_xxxx classes to prevent widget manager invocations
522filter_widget_class(Class) ->
523    z_convert:to_binary(re:replace(Class, <<"do_[0-9a-zA-Z_]+">>, <<>>, [global])).
524
525%% @doc Filter a url, remove any javascript.
526noscript(Url) -> 
527    case nows(z_convert:to_list(Url), []) of
528        "script:" ++ _ -> <<"#script-removed">>;
529        _ -> Url
530    end.
531
532    %% @doc Remove whitespace and make lowercase till we find a colon or slash.
533    nows([], Acc) -> lists:reverse(Acc);
534    nows([C|_] = L, Acc) when C =:= $:; C =:= $/ -> lists:reverse(Acc, L);
535    nows([C|T], Acc) when C =< 32 -> nows(T,Acc);
536    nows([C|T], Acc) when C >= $A, C =< $Z -> nows(T, [C+32|Acc]);
537    nows([$\\|T], Acc) -> nows(T, Acc);
538    nows([C|T], Acc) -> nows(T, [C|Acc]).
539
540
541%% @doc Translate any newlines to html br entities.
542nl2br(B) when is_binary(B) ->
543    nl2br_bin(B, <<>>);
544nl2br(L) ->
545    nl2br(L, []).
546
547    nl2br([], Acc) ->
548        lists:reverse(Acc);
549    nl2br("\r\n" ++ Rest, Acc) ->
550        nl2br(Rest, lists:reverse("<br />", Acc));
551    nl2br("\n" ++ Rest, Acc) ->
552        nl2br(Rest, lists:reverse("<br />", Acc));
553    nl2br([C | Rest], Acc) ->
554        nl2br(Rest, [C | Acc]).
555
556    nl2br_bin(<<>>, Acc) ->
557        Acc;
558    nl2br_bin(<<$\r, $\n, Post/binary>>, Acc) ->
559        nl2br_bin(Post, <<Acc/binary, "<br />">>);
560    nl2br_bin(<<$\r, Post/binary>>, Acc) ->
561        nl2br_bin(Post, <<Acc/binary, "<br />">>);
562    nl2br_bin(<<$\n, Post/binary>>, Acc) ->
563        nl2br_bin(Post, <<Acc/binary, "<br />">>);
564    nl2br_bin(<<C, Post/binary>>, Acc) ->
565        nl2br_bin(Post, <<Acc/binary, C>>).
566        
567
568%% @doc Given a HTML list, scrape all `<link>' elements and return their attributes. Attribute names are lowercased.
569%% @spec scrape_link_elements(string()) -> [LinkAttributes]
570scrape_link_elements(Html) ->
571    case re:run(Html, "<link[^>]+>", [global, caseless, {capture,all,list}]) of
572        {match, Elements} ->
573            F = fun(El) ->
574                        H = iolist_to_binary(["<p>", El, "</p>"]),
575                        {<<"p">>, [], [{_, Attrs, []}]} = mochiweb_html:parse(H),
576                        [{z_string:to_lower(binary_to_list(K)),binary_to_list(V)} || {K,V} <- lists:flatten(Attrs)]
577                end,
578            [F(El) || [El] <- Elements];
579        nomatch ->
580            []
581    end.
582
583
584%% @doc Ensure that `&'-characters are properly escaped inside a html string.
585ensure_escaped_amp(B) ->
586    ensure_escaped_amp(B, <<>>).
587
588ensure_escaped_amp(<<>>, Acc) ->
589    Acc;
590ensure_escaped_amp(<<$&, Rest/binary>>, Acc) ->
591    case try_amp(Rest, in_amp, <<>>) of
592        {Amp,Rest1} -> ensure_escaped_amp(Rest1, <<Acc/binary, $&, Amp/binary>>);
593        false -> ensure_escaped_amp(Rest, <<Acc/binary, "&amp;">>)
594    end;
595ensure_escaped_amp(<<C, Rest/binary>>, Acc) ->
596    ensure_escaped_amp(Rest, <<Acc/binary, C>>).
597
598
599    try_amp(<<$;,Rest/binary>>, in_ent_name, Acc) ->
600        {<<Acc/binary,$;>>, Rest};
601    try_amp(<<$;,Rest/binary>>, in_ent_val, Acc) ->
602        {<<Acc/binary,$;>>, Rest};
603    try_amp(<<$#,Rest/binary>>, in_amp, <<>>) -> 
604        try_amp(Rest, in_ent_val, <<$#>>);
605    try_amp(<<C,Rest/binary>>, in_ent_val, Acc) ->
606        case is_valid_ent_val(C) of
607            true -> try_amp(Rest, in_ent_val, <<Acc/binary,C>>);
608            false -> false
609        end;
610    try_amp(<<C,Rest/binary>>, in_amp, <<>>) ->
611        case is_valid_ent_char(C) of
612            true -> try_amp(Rest, in_ent_name, <<C>>);
613            false -> false
614        end;
615    try_amp(<<C,Rest/binary>>, in_ent_name, Acc) ->
616        case is_valid_ent_char(C) of
617            true -> try_amp(Rest, in_ent_name, <<Acc/binary, C>>);
618            false -> false
619        end;
620    try_amp(_B, _, _Acc) -> 
621        false.
622
623
624    is_valid_ent_char(C) ->
625        (C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z).
626
627    is_valid_ent_val(C) -> 
628        (C >= $a andalso C =< $f) orelse (C >= $A andalso C =< $F)
629        orelse (C >= $0 andalso C =< $9).