/src/support/z_html.erl

http://github.com/zotonic/zotonic · Erlang · 629 lines · 512 code · 61 blank · 56 comment · 13 complexity · 7372422d66bc019f560385c0932d8a73 MD5 · raw file

  1. %% @author Marc Worrell <marc@worrell.nl>
  2. %% @copyright 2009-2012 Marc Worrell
  3. %% Date: 2009-04-17
  4. %%
  5. %% @doc Utility functions for html processing. Also used for property filtering (by m_rsc_update).
  6. %% Copyright 2009-2012 Marc Worrell
  7. %%
  8. %% Licensed under the Apache License, Version 2.0 (the "License");
  9. %% you may not use this file except in compliance with the License.
  10. %% You may obtain a copy of the License at
  11. %%
  12. %% http://www.apache.org/licenses/LICENSE-2.0
  13. %%
  14. %% Unless required by applicable law or agreed to in writing, software
  15. %% distributed under the License is distributed on an "AS IS" BASIS,
  16. %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17. %% See the License for the specific language governing permissions and
  18. %% limitations under the License.
  19. -module(z_html).
  20. -author("Marc Worrell <marc@worrell.nl").
  21. %% interface functions
  22. -export([
  23. escape_props/1,
  24. escape_props/2,
  25. escape/1,
  26. unescape/1,
  27. strip/1,
  28. sanitize/1,
  29. sanitize/2,
  30. noscript/1,
  31. escape_link/1,
  32. nl2br/1,
  33. scrape_link_elements/1,
  34. ensure_escaped_amp/1
  35. ]).
  36. -include_lib("zotonic.hrl").
  %% @doc Escape all properties used for an update statement.
  %% Float, integer and atom values are passed through unchanged. The `body' and
  %% `body_extra' properties, and every property whose name ends in `_html', are
  %% run through sanitize/2; all other values go through escape_value/1.
  %% The result is accumulated by prepending, so the returned list is in the
  %% reverse order of the input list.
  %% @spec escape_props(PropertyList) -> PropertyList
  escape_props(Props) ->
      escape_props1(Props, [], undefined).

  %% @doc As escape_props/1, handing the context to sanitize/2.
  %% @spec escape_props(PropertyList, context()) -> PropertyList
  escape_props(Props, Context) ->
      escape_props1(Props, [], Context).

  %% Fold over the properties, prepending every escaped property to the accumulator.
  escape_props1(Props, Acc0, OptContext) ->
      lists:foldl(
          fun(Prop, Acc) -> [escape_prop(Prop, OptContext) | Acc] end,
          Acc0,
          Props).

  %% Escape or sanitize a single {Key, Value} property.
  escape_prop({_K, V} = Prop, _OptContext) when is_float(V); is_integer(V); is_atom(V) ->
      Prop;
  escape_prop({K, V}, OptContext) when K =:= body; K =:= body_extra ->
      {K, sanitize(V, OptContext)};
  escape_prop({K, V}, OptContext) ->
      %% Properties named xxx_html carry HTML and are sanitized instead of escaped.
      case lists:suffix("_html", z_convert:to_list(K)) of
          true -> {K, sanitize(V, OptContext)};
          false -> {K, escape_value(V)}
      end.
  %% Escape a single property value.
  %% Translations are escaped per language via escape_props/1, binaries are escaped
  %% directly, and lists are escaped when they can be converted to a binary. A list
  %% that is not valid iodata is returned unchanged, as is any other term.
  escape_value({trans, Texts}) ->
      {trans, escape_props(Texts)};
  escape_value(B) when is_binary(B) ->
      escape(B);
  escape_value(V) when is_list(V) ->
      try iolist_to_binary(V) of
          Bin -> escape(Bin)
      catch
          %% Not iodata (e.g. a list of tuples): deliberately left as-is.
          _:_ -> V
      end;
  escape_value(V) ->
      V.
  %% @doc Escape a string so that it is valid within HTML/XML.
  %% The characters `&', `<', `>', `"' and `'' are replaced with their entities.
  %% The literal text `&euro;' is special-cased: it is replaced with the euro sign
  %% (U+20AC, UTF-8 encoded) instead of having its ampersand escaped.
  %% `undefined' is returned as-is and `{trans, ...}' tuples are escaped per language.
  %% @spec escape(iolist() | undefined | {trans, list()}) -> binary() | undefined | {trans, list()}
  escape({trans, Tr}) ->
      {trans, [{Lang, escape(V)} || {Lang, V} <- Tr]};
  escape(undefined) ->
      undefined;
  escape(<<>>) ->
      <<>>;
  escape([]) ->
      <<>>;
  escape(L) when is_list(L) ->
      escape(list_to_binary(L));
  escape(B) when is_binary(B) ->
      escape(B, <<>>).

  %% Walk the binary byte by byte, appending the escaped form to the accumulator.
  escape(<<>>, Acc) ->
      Acc;
  escape(<<"&euro;", T/binary>>, Acc) ->
      %% Euro sign spelled as explicit UTF-8 bytes (E2 82 AC) so the result does
      %% not depend on the encoding of this source file.
      escape(T, <<Acc/binary, 16#E2, 16#82, 16#AC>>);
  escape(<<$&, T/binary>>, Acc) ->
      escape(T, <<Acc/binary, "&amp;">>);
  escape(<<$<, T/binary>>, Acc) ->
      escape(T, <<Acc/binary, "&lt;">>);
  escape(<<$>, T/binary>>, Acc) ->
      escape(T, <<Acc/binary, "&gt;">>);
  escape(<<$", T/binary>>, Acc) ->
      escape(T, <<Acc/binary, "&quot;">>);
  escape(<<$', T/binary>>, Acc) ->
      escape(T, <<Acc/binary, "&#39;">>);
  escape(<<C, T/binary>>, Acc) ->
      escape(T, <<Acc/binary, C>>).
  %% @doc Unescape - reverses the effect of escape.
  %% The entities `&amp;', `&quot;', `&#39;', `&lt;' and `&gt;' are replaced with
  %% their characters, and `&euro;' with the euro sign (U+20AC, UTF-8 encoded).
  %% The input is scanned once, so `&amp;lt;' becomes `&lt;' and not `<'.
  %% `undefined' is returned as-is and `{trans, ...}' tuples are unescaped per language.
  %% @spec unescape(iolist() | undefined | {trans, list()}) -> binary() | undefined | {trans, list()}
  unescape({trans, Tr}) ->
      {trans, [{Lang, unescape(V)} || {Lang, V} <- Tr]};
  unescape(undefined) ->
      undefined;
  unescape(<<>>) ->
      <<>>;
  unescape([]) ->
      <<>>;
  unescape(L) when is_list(L) ->
      unescape(list_to_binary(L));
  unescape(B) when is_binary(B) ->
      unescape(B, <<>>).

  %% Walk the binary, replacing the known entities and copying all other bytes.
  unescape(<<>>, Acc) ->
      Acc;
  unescape(<<"&amp;", T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, "&">>);
  unescape(<<"&quot;", T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, "\"">>);
  unescape(<<"&#39;", T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, "'">>);
  unescape(<<"&lt;", T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, "<">>);
  unescape(<<"&gt;", T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, ">">>);
  unescape(<<"&euro;", T/binary>>, Acc) ->
      %% Euro sign spelled as explicit UTF-8 bytes (E2 82 AC) so the result does
      %% not depend on the encoding of this source file.
      unescape(T, <<Acc/binary, 16#E2, 16#82, 16#AC>>);
  unescape(<<C, T/binary>>, Acc) ->
      unescape(T, <<Acc/binary, C>>).
  %% @doc Escape a text and expand any urls in it to links with a nofollow attribute.
  %% Newlines in the result are translated with nl2br/1. `undefined' is returned
  %% as-is; an empty binary or list gives an empty binary.
  %% @spec escape_link(Text) -> binary()
  escape_link(undefined) ->
      undefined;
  escape_link(<<>>) ->
      <<>>;
  escape_link([]) ->
      <<>>;
  escape_link(Text) ->
      Found = re:run(Text, "\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/)))", [{capture, first, index}, global]),
      case Found of
          nomatch ->
              %% No urls present: plain escaping is sufficient.
              nl2br(escape(Text));
          {match, Matches} ->
              %% With `{capture, first, ...}' every match holds a single {Offset, Length} pair.
              Spans = lists:map(fun erlang:hd/1, Matches),
              Parts = make_links1(0, Spans, z_convert:to_list(Text), []),
              nl2br(iolist_to_binary(Parts))
      end.
  %% Build an iolist from the text, turning every {Offset, Length} span into an
  %% anchor element and escaping the text between the spans.
  %% `Offset' is the position in the original text at which `Text' starts.
  make_links1(_Offset, [], Text, Acc) ->
      %% No more links: escape whatever text is left.
      lists:reverse([escape(Text) | Acc]);
  make_links1(Offset, [{Start, Len} | Rest], Text, Acc) when Start =:= Offset ->
      %% The text starts with a link; cut it off and wrap it in an anchor.
      {Link, Tail} = lists:split(Len, Text),
      Safe = noscript(Link),
      Href = escape(ensure_protocol(Safe)),
      Label = escape(Safe),
      Anchor = ["<a href=\"", Href, "\" rel=\"nofollow\">", Label, "</a>"],
      make_links1(Offset + Len, Rest, Tail, [Anchor | Acc]);
  make_links1(Offset, [{Start, _} | _] = Matches, Text, Acc) ->
      %% Plain text precedes the next link; escape it and continue at the link.
      {Before, Tail} = lists:split(Start - Offset, Text),
      make_links1(Start, Matches, Tail, [escape(Before) | Acc]).
  %% Make sure that a link starting with `www.' gets a `http://' protocol prefix.
  %% The result is an iolist (or the unchanged argument).
  %% Only the exact prefix `www.' is expanded: escape_link/1 also matches links
  %% of the form `scheme://...', and a scheme that merely starts with the letters
  %% `www' (e.g. `wwwfoo://host') must not get a second protocol prepended.
  ensure_protocol([]) ->
      [];
  ensure_protocol("#" ++ _ = Link) ->
      Link;
  ensure_protocol("www." ++ Rest) ->
      ["http://www.", Rest];
  ensure_protocol(Link) ->
      Link.
  %% @doc Strip all html elements from the text. A simple scanner is used to find
  %% the elements; every closed tag is replaced with a single space. Stray `>'
  %% characters in the text are dropped. The end result is not escaped.
  %% `undefined' and the empty list give the empty list.
  %% @spec strip(iolist()) -> iolist()
  strip({trans, Tr}) ->
      {trans, [{Lang, strip(V)} || {Lang, V} <- Tr]};
  strip(undefined) ->
      [];
  strip(<<>>) ->
      <<>>;
  strip([]) ->
      [];
  strip(Html) when is_binary(Html) ->
      strip_text(Html, <<>>);
  strip(L) when is_list(L) ->
      strip(list_to_binary(L)).

  %% Outside any tag: copy bytes, `<' opens a tag, a stray `>' is dropped.
  strip_text(<<>>, Acc) ->
      Acc;
  strip_text(<<$<, T/binary>>, Acc) ->
      strip_tag(T, Acc);
  strip_text(<<$>, T/binary>>, Acc) ->
      strip_text(T, Acc);
  strip_text(<<C, T/binary>>, Acc) ->
      strip_text(T, <<Acc/binary, C>>).

  %% Inside a tag: everything is dropped; `>' closes the tag and emits a space,
  %% a quote character starts a quoted attribute value.
  strip_tag(<<>>, Acc) ->
      Acc;
  strip_tag(<<$>, T/binary>>, Acc) ->
      strip_text(T, <<Acc/binary, 32>>);
  strip_tag(<<Q, T/binary>>, Acc) when Q =:= $"; Q =:= $' ->
      strip_quoted(T, Q, Acc);
  strip_tag(<<_, T/binary>>, Acc) ->
      strip_tag(T, Acc).

  %% Inside a quoted attribute value: everything is dropped up to the matching
  %% quote; a backslash hides the byte that follows it.
  strip_quoted(<<>>, _Quote, Acc) ->
      Acc;
  strip_quoted(<<$\\, _, T/binary>>, Quote, Acc) ->
      strip_quoted(T, Quote, Acc);
  strip_quoted(<<Quote, T/binary>>, Quote, Acc) ->
      strip_tag(T, Acc);
  strip_quoted(<<_, T/binary>>, Quote, Acc) ->
      strip_quoted(T, Quote, Acc).
  %% @doc Sanitize a (X)HTML string. Remove elements and attributes that might be harmful.
  %% @spec sanitize(binary()) -> binary()
  sanitize(Html) ->
      sanitize(Html, undefined).

  %% Sanitize with an optional context. `{trans, ...}' tuples are sanitized per
  %% language. The html is wrapped in a `<sanitize>' element before parsing.
  sanitize({trans, Tr}, OptContext) ->
      {trans, [{Lang, sanitize(V, OptContext)} || {Lang, V} <- Tr]};
  sanitize(Html, OptContext) when is_binary(Html); is_list(Html) ->
      Wrapped = iolist_to_binary([<<"<sanitize>">>, Html, <<"</sanitize>">>]),
      sanitize_opts(Wrapped, OptContext).

  %% Collect the extra allowed attributes and elements, then sanitize.
  sanitize_opts(Html, OptContext) ->
      ExtraAttrs = sanitize_extra(html_attr_extra, OptContext),
      ExtraElts = sanitize_extra(html_elt_extra, OptContext),
      sanitize1(Html, ExtraElts, ExtraAttrs, OptContext).

  %% Read a comma separated list from the `site' config key; without a context
  %% there are no extras.
  sanitize_extra(Key, OptContext) ->
      case OptContext of
          #context{} ->
              binstr:split(m_config:get_value(site, Key, <<>>, OptContext), <<",">>);
          undefined ->
              []
      end.

  %% Parse the html, sanitize the parse tree and flatten it to a binary again.
  sanitize1(Html, ExtraElts, ExtraAttrs, OptContext) ->
      Tree = mochiweb_html:parse(ensure_escaped_amp(Html)),
      flatten(sanitize(Tree, [], ExtraElts, ExtraAttrs, OptContext)).
  %% Sanitize a node of the parsed html tree.
  %% `Stack' holds the lowercased names of the enclosing elements that were kept.
  %% Text is escaped, comments are kept and processing instructions are dropped.
  %% An element is kept when it is allowed by allow_elt/2, or when allow_once/1
  %% allows it and it is not already on the stack. A rejected element becomes a
  %% `nop' node holding its sanitized children, or nothing at all when
  %% skip_contents/1 holds for it.
  sanitize(Text, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) when is_binary(Text) ->
      escape(Text);
  sanitize({comment, _Text} = Comment, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
      Comment;
  sanitize({pi, _Raw}, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
      <<>>;
  sanitize({pi, _Tag, _Attrs}, _Stack, _ExtraElts, _ExtraAttrs, _OptContext) ->
      <<>>;
  sanitize({Elt, Attrs, Enclosed}, Stack, ExtraElts, ExtraAttrs, OptContext) ->
      Name = list_to_binary(z_string:to_lower(Elt)),
      IsKept = allow_elt(Name, ExtraElts)
               orelse ((not lists:member(Name, Stack)) andalso allow_once(Name)),
      case IsKept of
          true ->
              sanitize_kept_elt(Name, Attrs, Enclosed, Stack, ExtraElts, ExtraAttrs, OptContext);
          false ->
              sanitize_dropped_elt(Name, Enclosed, Stack, ExtraElts, ExtraAttrs, OptContext)
      end.

  %% Keep the element: filter its attributes, lowercase the attribute names and
  %% sanitize the children with the element pushed on the stack. With a context
  %% the resulting tag is folded over the `sanitize_element' notification.
  sanitize_kept_elt(Name, Attrs, Enclosed, Stack, ExtraElts, ExtraAttrs, OptContext) ->
      KeptAttrs = lists:filter(fun({A, _}) -> allow_attr(A, ExtraAttrs) end, Attrs),
      LowerAttrs = lists:map(
                      fun({A, V}) -> {list_to_binary(z_string:to_lower(A)), V} end,
                      KeptAttrs),
      InnerStack = [Name | Stack],
      Children = [ sanitize(Child, InnerStack, ExtraElts, ExtraAttrs, OptContext)
                   || Child <- Enclosed ],
      Tag = {Name, LowerAttrs, Children},
      case OptContext of
          #context{} -> z_notifier:foldl(sanitize_element, Tag, OptContext);
          undefined -> Tag
      end.

  %% Drop the element itself; its children survive unless the contents must be skipped.
  sanitize_dropped_elt(Name, Enclosed, Stack, ExtraElts, ExtraAttrs, OptContext) ->
      case skip_contents(Name) of
          true ->
              {nop, []};
          false ->
              {nop, [ sanitize(Child, Stack, ExtraElts, ExtraAttrs, OptContext)
                      || Child <- Enclosed ]}
      end.
  %% @doc Flatten the sanitized html tree to a binary.
  %% Text nodes get their `<', `>' and quote characters escaped, `nop' nodes are
  %% replaced by their contents, and br/hr/img elements without content are
  %% written as self-closing tags.
  flatten(Text) when is_binary(Text) ->
      escape_html_text(Text, <<>>);
  flatten({nop, Enclosed}) ->
      flatten(Enclosed);
  flatten({comment, Text}) ->
      Escaped = escape_html_comment(Text, <<>>),
      <<"<!--", Escaped/binary, "-->">>;
  flatten({Elt, Attrs, Enclosed}) ->
      Inner = flatten(Enclosed),
      %% Every attribute is preceded by a single space.
      AttrBin = iolist_to_binary(z_utils:prefix(32, lists:map(fun flatten_attr/1, Attrs))),
      case {is_selfclosing(Elt), Inner} of
          {true, <<>>} ->
              <<$<, Elt/binary, AttrBin/binary, 32, $/, $>>>;
          _ ->
              <<$<, Elt/binary, AttrBin/binary, $>, Inner/binary, $<, $/, Elt/binary, $>>>
      end;
  flatten(Nodes) when is_list(Nodes) ->
      iolist_to_binary(lists:map(fun flatten/1, Nodes)).
  %% @doc Flatten an attribute to a binary of the form `name="value"'.
  %% The `style' value is filtered with filter_css/1, the `class' value loses its
  %% do_xxxx widget classes, and url attributes are passed through noscript/1.
  %% All values are escaped with escape/1, which also accepts lists. The style
  %% clause in particular needs this, as filter_css/1 returns `[]' for an
  %% undefined or empty-list value.
  %% @todo Filter javascript from the value (when there is a ':' then only allow http/https)
  %% @todo Strip scripting and text css attributes
  %% css: anything within () should be removed
  flatten_attr({<<"style">>, Value}) ->
      Value1 = escape(filter_css(Value)),
      <<"style=\"", Value1/binary, $">>;
  flatten_attr({<<"class">>, Value}) ->
      % Remove all do_xxxx widget manager classes
      Value1 = escape(filter_widget_class(Value)),
      <<"class=\"", Value1/binary, $">>;
  flatten_attr({Attr, Value}) ->
      Value1 = case is_url_attr(Attr) of
                   true -> noscript(Value);
                   false -> Value
               end,
      Value2 = escape(Value1),
      <<Attr/binary, $=, $", Value2/binary, $">>.
  %% @doc Escape smaller-than, greater-than, single and double quotes in texts.
  %% Ampersands are left alone (they are expected to be removed or escaped already).
  %% The escaped text is appended to the accumulator.
  escape_html_text(<<>>, Acc) ->
      Acc;
  escape_html_text(Text, Acc) when is_binary(Text) ->
      Escaped = << <<(html_text_entity(C))/binary>> || <<C>> <= Text >>,
      <<Acc/binary, Escaped/binary>>.

  %% The replacement for a single byte of text.
  html_text_entity($<) -> <<"&lt;">>;
  html_text_entity($>) -> <<"&gt;">>;
  html_text_entity($") -> <<"&quot;">>;
  html_text_entity($') -> <<"&#39;">>;
  html_text_entity(C) -> <<C>>.
  %% @doc Escape smaller-than and greater-than characters (for use in comments).
  %% The escaped text is appended to the accumulator.
  escape_html_comment(<<>>, Acc) ->
      Acc;
  escape_html_comment(Text, Acc) when is_binary(Text) ->
      Escaped = << <<(html_comment_entity(C))/binary>> || <<C>> <= Text >>,
      <<Acc/binary, Escaped/binary>>.

  %% The replacement for a single byte of comment text.
  html_comment_entity($<) -> <<"&lt;">>;
  html_comment_entity($>) -> <<"&gt;">>;
  html_comment_entity(C) -> <<C>>.
  %% @doc Elements that can only occur once in a nesting.
  %% Used for cleaning up code from html editors.
  %% Expects the lowercased element name as a binary.
  allow_once(Elt) ->
      lists:member(Elt, [
          <<"a">>, <<"abbr">>, <<"area">>, <<"article">>,
          <<"b">>, <<"bdo">>, <<"big">>, <<"br">>,
          <<"cite">>, <<"del">>, <<"dfn">>, <<"em">>,
          <<"hr">>, <<"i">>, <<"ins">>, <<"nav">>,
          <<"p">>, <<"pre">>, <<"q">>, <<"s">>,
          <<"small">>, <<"sub">>, <<"sup">>, <<"strong">>,
          <<"strike">>, <<"tt">>, <<"u">>, <<"var">>
      ]).
  %% @doc Allowed elements (see also allow_once/1 above).
  %% An element is allowed when it is in the built-in list or in the list of
  %% extra elements. Expects the lowercased element name as a binary.
  allow_elt(Elt, Extra) ->
      allow_elt(Elt) orelse lists:member(Elt, Extra).

  %% The built-in list of elements that may be nested without restriction.
  allow_elt(Elt) ->
      lists:member(Elt, [
          <<"audio">>, <<"address">>, <<"bdo">>, <<"blockquote">>,
          <<"caption">>, <<"col">>, <<"colgroup">>,
          <<"dd">>, <<"dl">>, <<"dt">>, <<"div">>,
          <<"h1">>, <<"h2">>, <<"h3">>, <<"h4">>, <<"h5">>, <<"h6">>,
          <<"header">>, <<"img">>, <<"li">>, <<"legend">>, <<"map">>,
          <<"ol">>, <<"samp">>, <<"section">>, <<"source">>, <<"span">>,
          <<"table">>, <<"tbody">>, <<"tfoot">>, <<"thead">>,
          <<"td">>, <<"th">>, <<"tr">>, <<"ul">>, <<"video">>
      ]).
  %% @doc Allowed attributes.
  %% An attribute is allowed when it is in the built-in list or in the list of
  %% extra attributes. The name is compared as given (a lowercase binary is expected).
  allow_attr(Attr, Extra) ->
      allow_attr(Attr) orelse lists:member(Attr, Extra).

  %% The built-in list of allowed attributes. Note that `id' is not in the list.
  allow_attr(Attr) ->
      lists:member(Attr, [
          <<"align">>, <<"alt">>, <<"autoplay">>,
          <<"border">>, <<"borderspacing">>,
          <<"cellpadding">>, <<"cellspacing">>, <<"class">>, <<"colspan">>,
          <<"controls">>, <<"coords">>, <<"dir">>,
          <<"height">>, <<"href">>,
          <<"loop">>, <<"name">>, <<"poster">>, <<"preload">>,
          <<"rel">>, <<"rowspan">>, <<"shape">>, <<"src">>, <<"style">>,
          <<"target">>, <<"title">>, <<"usemap">>, <<"valign">>, <<"width">>
      ]).
  %% @doc Check if the attribute might contain an url (src, href or poster).
  is_url_attr(Attr) ->
      lists:member(Attr, [<<"src">>, <<"href">>, <<"poster">>]).
  %% @doc Elements that shouldn't use an open and close tag (br, hr and img).
  is_selfclosing(Elt) ->
      lists:member(Elt, [<<"br">>, <<"hr">>, <<"img">>]).
  %% @doc Disallowed elements whose contents should be skipped as well
  %% (style, script, deleteme and head).
  skip_contents(Elt) ->
      lists:member(Elt, [<<"style">>, <<"script">>, <<"deleteme">>, <<"head">>]).
  %% @doc Simple filter for css. Removes the parts between () and the contents of
  %% quoted strings; the parentheses and quotes themselves are kept. Outside
  %% parentheses and strings a backslash is dropped and the byte after it is
  %% copied literally; an unbalanced `)' is dropped.
  %% `undefined' and the empty list give the empty list.
  filter_css(undefined) ->
      [];
  filter_css(<<>>) ->
      <<>>;
  filter_css([]) ->
      [];
  filter_css(Css) when is_binary(Css) ->
      css_text(Css, <<>>);
  filter_css(L) when is_list(L) ->
      filter_css(list_to_binary(L)).

  %% Plain css text: copy bytes, watching for parentheses, quotes and backslashes.
  css_text(<<>>, Acc) ->
      Acc;
  css_text(<<$(, T/binary>>, Acc) ->
      css_paren(T, <<Acc/binary, $(>>);
  css_text(<<$), T/binary>>, Acc) ->
      css_text(T, Acc);
  css_text(<<Q, T/binary>>, Acc) when Q =:= $"; Q =:= $' ->
      css_quoted(T, Q, <<Acc/binary, Q>>);
  css_text(<<$\\, C, T/binary>>, Acc) ->
      css_text(T, <<Acc/binary, C>>);
  css_text(<<C, T/binary>>, Acc) ->
      css_text(T, <<Acc/binary, C>>).

  %% Between parentheses: drop everything up to the first `)'.
  css_paren(<<>>, Acc) ->
      Acc;
  css_paren(<<$), T/binary>>, Acc) ->
      css_text(T, <<Acc/binary, $)>>);
  css_paren(<<_, T/binary>>, Acc) ->
      css_paren(T, Acc).

  %% Inside a quoted string: drop everything up to the matching quote; a
  %% backslash hides the byte that follows it.
  css_quoted(<<>>, _Quote, Acc) ->
      Acc;
  css_quoted(<<Quote, T/binary>>, Quote, Acc) ->
      css_text(T, <<Acc/binary, Quote>>);
  css_quoted(<<$\\, _, T/binary>>, Quote, Acc) ->
      css_quoted(T, Quote, Acc);
  css_quoted(<<_, T/binary>>, Quote, Acc) ->
      css_quoted(T, Quote, Acc).
  %% @doc Remove all do_xxxx classes to prevent widget manager invocations.
  %% Every occurrence of `do_' followed by letters, digits or underscores is removed.
  filter_widget_class(Class) ->
      Cleaned = re:replace(Class, <<"do_[0-9a-zA-Z_]+">>, <<>>, [global]),
      z_convert:to_binary(Cleaned).
  %% @doc Filter a url, remove any javascript.
  %% The start of the url is normalized with nows/2 (whitespace and backslashes
  %% removed, uppercase folded to lowercase). When the result starts with one of
  %% the scripting schemes `javascript:', `vbscript:' or `script:' the url is
  %% replaced with `#script-removed'; otherwise the url is returned unchanged.
  noscript(Url) ->
      case nows(z_convert:to_list(Url), []) of
          "javascript:" ++ _ -> <<"#script-removed">>;
          "vbscript:" ++ _ -> <<"#script-removed">>;
          "script:" ++ _ -> <<"#script-removed">>;
          _ -> Url
      end.
  %% @doc Remove whitespace and make lowercase till we find a colon or slash.
  %% Characters up to and including the space (=< 32) and backslashes are dropped,
  %% A-Z is folded to a-z. Everything from the first colon or slash onwards is
  %% kept as-is. The result is prefixed with the reversed accumulator.
  nows(Chars, Acc) ->
      {Head, Tail} = lists:splitwith(fun(C) -> C =/= $: andalso C =/= $/ end, Chars),
      Normalized = [ nows_lower(C) || C <- Head, C > 32, C =/= $\\ ],
      lists:reverse(Acc, Normalized ++ Tail).

  %% Fold an ASCII uppercase letter to lowercase; leave anything else alone.
  nows_lower(C) when C >= $A, C =< $Z -> C + 32;
  nows_lower(C) -> C.
  %% @doc Translate any newlines to html br elements.
  %% Accepts a binary (giving a binary) or a flat character list (giving a list).
  %% In both cases `\r\n', a lone `\r' and a lone `\n' each become a single `<br />'.
  nl2br(B) when is_binary(B) ->
      nl2br_bin(B, <<>>);
  nl2br(L) ->
      nl2br(L, []).

  %% List variant; the accumulator is built in reverse.
  nl2br([], Acc) ->
      lists:reverse(Acc);
  nl2br("\r\n" ++ Rest, Acc) ->
      nl2br(Rest, lists:reverse("<br />", Acc));
  nl2br("\r" ++ Rest, Acc) ->
      %% A lone carriage return is a line break too, as in the binary variant.
      nl2br(Rest, lists:reverse("<br />", Acc));
  nl2br("\n" ++ Rest, Acc) ->
      nl2br(Rest, lists:reverse("<br />", Acc));
  nl2br([C | Rest], Acc) ->
      nl2br(Rest, [C | Acc]).

  %% Binary variant.
  nl2br_bin(<<>>, Acc) ->
      Acc;
  nl2br_bin(<<$\r, $\n, Post/binary>>, Acc) ->
      nl2br_bin(Post, <<Acc/binary, "<br />">>);
  nl2br_bin(<<$\r, Post/binary>>, Acc) ->
      nl2br_bin(Post, <<Acc/binary, "<br />">>);
  nl2br_bin(<<$\n, Post/binary>>, Acc) ->
      nl2br_bin(Post, <<Acc/binary, "<br />">>);
  nl2br_bin(<<C, Post/binary>>, Acc) ->
      nl2br_bin(Post, <<Acc/binary, C>>).
  %% @doc Given a HTML text, scrape all `<link>' elements and return their attributes.
  %% Every element gives a list of {Name, Value} string pairs; the attribute names
  %% are lowercased. Returns the empty list when no `<link>' element is found.
  %% @spec scrape_link_elements(string()) -> [LinkAttributes]
  scrape_link_elements(Html) ->
      case re:run(Html, "<link[^>]+>", [global, caseless, {capture,all,list}]) of
          nomatch ->
              [];
          {match, Elements} ->
              [ link_attrs(El) || [El] <- Elements ]
      end.

  %% Parse a single `<link ...>' tag (wrapped in a paragraph so that the parser
  %% has a root element) and return its attributes as lists.
  link_attrs(El) ->
      Wrapped = iolist_to_binary(["<p>", El, "</p>"]),
      {<<"p">>, [], [{_, Attrs, []}]} = mochiweb_html:parse(Wrapped),
      [ {z_string:to_lower(binary_to_list(K)), binary_to_list(V)}
        || {K, V} <- lists:flatten(Attrs) ].
  %% @doc Ensure that `&'-characters are properly escaped inside a html string.
  %% An ampersand that starts a well-formed character reference is left alone;
  %% any other ampersand is replaced with `&amp;'. Recognized references are:
  %% named (`&name;', a letter followed by letters or digits), decimal
  %% (`&#1234;') and hexadecimal (`&#x1F;' or `&#X1F;'). A numeric reference
  %% needs at least one digit.
  ensure_escaped_amp(B) ->
      ensure_escaped_amp(B, <<>>).

  ensure_escaped_amp(<<>>, Acc) ->
      Acc;
  ensure_escaped_amp(<<$&, Rest/binary>>, Acc) ->
      case try_amp(Rest, in_amp, <<>>) of
          {Amp, Rest1} -> ensure_escaped_amp(Rest1, <<Acc/binary, $&, Amp/binary>>);
          false -> ensure_escaped_amp(Rest, <<Acc/binary, "&amp;">>)
      end;
  ensure_escaped_amp(<<C, Rest/binary>>, Acc) ->
      ensure_escaped_amp(Rest, <<Acc/binary, C>>).

  %% Try to read a character reference directly after an ampersand.
  %% Returns {Reference, Rest}, where Reference includes the closing `;', or
  %% `false' when the text is not a well-formed reference.
  %% States: in_amp (just after the `&'), in_ent_name (inside a name),
  %% in_num (just after `#'), in_dec (inside decimal digits),
  %% in_hex_start (just after `#x'), in_hex (inside hexadecimal digits).
  try_amp(<<$;, Rest/binary>>, State, Acc)
      when State =:= in_ent_name; State =:= in_dec; State =:= in_hex ->
      {<<Acc/binary, $;>>, Rest};
  try_amp(<<$#, Rest/binary>>, in_amp, <<>>) ->
      try_amp(Rest, in_num, <<$#>>);
  try_amp(<<X, Rest/binary>>, in_num, Acc) when X =:= $x; X =:= $X ->
      try_amp(Rest, in_hex_start, <<Acc/binary, X>>);
  try_amp(<<C, Rest/binary>>, State, Acc) when State =:= in_num; State =:= in_dec ->
      case is_ent_digit(C) of
          true -> try_amp(Rest, in_dec, <<Acc/binary, C>>);
          false -> false
      end;
  try_amp(<<C, Rest/binary>>, State, Acc) when State =:= in_hex_start; State =:= in_hex ->
      case is_valid_ent_val(C) of
          true -> try_amp(Rest, in_hex, <<Acc/binary, C>>);
          false -> false
      end;
  try_amp(<<C, Rest/binary>>, in_amp, <<>>) ->
      %% A name must start with a letter.
      case is_valid_ent_char(C) of
          true -> try_amp(Rest, in_ent_name, <<C>>);
          false -> false
      end;
  try_amp(<<C, Rest/binary>>, in_ent_name, Acc) ->
      %% Names such as frac12 or sup2 contain digits after the first letter.
      case is_valid_ent_char(C) orelse is_ent_digit(C) of
          true -> try_amp(Rest, in_ent_name, <<Acc/binary, C>>);
          false -> false
      end;
  try_amp(_B, _, _Acc) ->
      false.

  %% An ASCII letter.
  is_valid_ent_char(C) ->
      (C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z).

  %% A hexadecimal digit.
  is_valid_ent_val(C) ->
      (C >= $a andalso C =< $f) orelse (C >= $A andalso C =< $F)
      orelse (C >= $0 andalso C =< $9).

  %% A decimal digit.
  is_ent_digit(C) ->
      C >= $0 andalso C =< $9.