
/deps/mochiweb/src/mochiweb_html.erl

https://code.google.com/p/zotonic/

%% @author Bob Ippolito <bob@mochimedia.com>
%% @copyright 2007 Mochi Media, Inc.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\").
-define(SQUOTE, $\').
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).
-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).
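
%% A usage sketch, mirroring parse_test/0 below (hypothetical shell
%% session; the result is an html_node() triple as documented above):
%%
%%   1> mochiweb_html:parse(<<"<html><dd>foo<dt>bar</html>">>).
%%   {<<"html">>, [],
%%    [{<<"dd">>, [], [<<"foo">>]},
%%     {<<"dt">>, [], [<<"bar">>]}]}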

%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    F = fun (X) ->
                case X of
                    {start_tag, _, _, false} ->
                        false;
                    _ ->
                        true
                end
        end,
    [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.

%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).
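
%% Sketch of the token stream shape, taken from tokens_test/0 below;
%% note the script body comes through as a single raw data token:
%%
%%   1> mochiweb_html:tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>).
%%   [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
%%    {data, <<" A= B <= C ">>, false},
%%    {end_tag, <<"script">>}]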

%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]).
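
%% For example (from to_tokens_test/0 below), atom tags are accepted
%% and normalized to binaries:
%%
%%   1> mochiweb_html:to_tokens({p}).
%%   [{start_tag, <<"p">>, [], false}, {end_tag, <<"p">>}]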

%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).
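
%% A minimal sketch adapted from to_html_test/0 below; the input tree is
%% hypothetical, and the result is an iolist, hence iolist_to_binary/1:
%%
%%   1> iolist_to_binary(mochiweb_html:to_html({p, [{class, foo}], [<<"hi">>]})).
%%   <<"<p class=\"foo\">hi</p>">>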

%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).
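
%% From escape_test/0 below: only the three raw characters are escaped,
%% so already-escaped entities get escaped again:
%%
%%   1> mochiweb_html:escape(<<"&quot;\"word ><<up!&quot;">>).
%%   <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>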

%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    escape_attr(mochinum:digits(F), []).
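
%% Unlike escape/1, escape_attr/1 also escapes double quotes and accepts
%% numbers (cf. escape_attr_test/0 below):
%%
%%   1> mochiweb_html:escape_attr(12345).
%%   <<"12345">>
%%   2> mochiweb_html:escape_attr(1.5).
%%   <<"1.5">>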

to_html(Tree, Acc) ->
    to_html(Tree, Acc, true).

to_html([], Acc, _Escape) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc, Escape) ->
    to_html(Rest, [Content | Acc], Escape);
to_html([{pi, Bin} | Rest], Acc, Escape) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc], Escape);
to_html([{pi, Tag, Attrs} | Rest], Acc, Escape) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc], Escape);
to_html([{comment, Comment} | Rest], Acc, Escape) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc], Escape);
to_html([{doctype, Parts} | Rest], Acc, Escape) ->
    Inside = doctype_to_html(Parts, Acc),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc], Escape);
to_html([{data, Data, _Whitespace} | Rest], Acc, true) ->
    to_html(Rest, [escape(Data) | Acc], true);
to_html([{data, Data, _Whitespace} | Rest], Acc, false) ->
    to_html(Rest, [Data | Acc], false);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc, _Escape) ->
    EscapeData = case Tag of
                     <<"script">> -> false;
                     _ -> true
                 end,
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc], EscapeData);
to_html([{end_tag, Tag} | Rest], Acc, _Escape) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc], false).

doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.

attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).

escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).

escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).

to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).

to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).

tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.

parse_flag({start_tag, B, _, false}) ->
    case string:to_lower(binary_to_list(B)) of
        "script" ->
            script;
        "textarea" ->
            textarea;
        _ ->
            none
    end;
parse_flag(_) ->
    none.

tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
            %% This isn't really strict HTML
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.
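
%% The "< followed by whitespace" clause above is why a stray "<" falls
%% through as data, per tokens_test/0 below:
%%
%%   1> mochiweb_html:tokens(<<"not html < at all">>).
%%   [{data, <<"not html ">>, false},
%%    {data, <<"< at all">>, false}]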

tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    tree(Rest, Stack).

norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).

stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
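
%% The li/option and dd/dt clauses above implicitly close an open
%% sibling, which is how the unclosed <li> tokens in the D7 case of
%% parse_tokens_test/0 below end up as three sibling <li> nodes rather
%% than nested ones.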

append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].

destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.

destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
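
%% Example from destack_test/0 below: closing <<"b">> folds everything
%% up to and including the matching open element back into its parent:
%%
%%   destack(<<"b">>, [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}])
%%   %% => [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}]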

is_singleton(<<"br">>) -> true;
is_singleton(<<"hr">>) -> true;
is_singleton(<<"img">>) -> true;
is_singleton(<<"input">>) -> true;
is_singleton(<<"base">>) -> true;
is_singleton(<<"meta">>) -> true;
is_singleton(<<"link">>) -> true;
is_singleton(<<"area">>) -> true;
is_singleton(<<"param">>) -> true;
is_singleton(<<"col">>) -> true;
is_singleton(_) -> false.

tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.

tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.

tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.

tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            {[], S};
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.

tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data | Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $\n, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_LINE(S)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C | Acc], Q)
    end.

tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data | Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C | Acc])
    end.
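
%% The three attribute styles handled above, from tokens_test/0 below
%% (unquoted, single-quoted, double-quoted; the trailing "/" marks the
%% tag as a singleton):
%%
%%   1> mochiweb_html:tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>).
%%   [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
%%                            {<<"wibble">>, <<"wibble">>},
%%                            {<<"alice">>, <<"bob">>}], true}]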

skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.

tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                         orelse C =:= $/
                                         orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.

tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.

raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.

find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.

find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.

tokenize_charref(Bin, S=#decoder{offset=O}) ->
    tokenize_charref(Bin, S, O).

tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $<
                                         orelse C =:= $>
                                         orelse C =:= $& ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           Start1 = Start - 1,
                           Len1 = Len + 1,
                           <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                           R;
                       Unichar ->
                           mochiutf8:codepoint_to_bytes(Unichar)
                   end,
            {{data, Data, false}, S};
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           Start1 = Start - 1,
                           Len1 = Len + 2,
                           <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                           R;
                       Unichar ->
                           mochiutf8:codepoint_to_bytes(Unichar)
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.
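
%% Charref behavior, per parse_charref_test/0 and the garbage-in test
%% below: known entities decode with or without the semicolon, unknown
%% ones pass through verbatim:
%%
%%   1> mochiweb_html:parse(<<"<div>&amp &amp;</div>">>).
%%   {<<"div">>, [], [<<"& &">>]}
%%   2> mochiweb_html:parse(<<"<div>&amp. test</div>">>).
%%   {<<"div">>, [], [<<"&amp. test">>]}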

tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.

tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.

tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.

tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            %% Unterminated CDATA: emit everything from the start of
            %% the section, as tokenize_comment/3 does at end of input.
            <<_:Start/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.

tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.

tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
        when (SS =:= $s orelse SS =:= $S) andalso
             (CC =:= $c orelse CC =:= $C) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (II =:= $i orelse II =:= $I) andalso
             (PP =:= $p orelse PP =:= $P) andalso
             (TT =:= $t orelse TT =:= $T) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
        when (TT =:= $t orelse TT =:= $T) andalso
             (EE =:= $e orelse EE =:= $E) andalso
             (XX =:= $x orelse XX =:= $X) andalso
             (TT2 =:= $t orelse TT2 =:= $T) andalso
             (AA =:= $a orelse AA =:= $A) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (EE2 =:= $e orelse EE2 =:= $E) andalso
             (AA2 =:= $a orelse AA2 =:= $A) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

%%
%% Tests
%%
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").

to_html_test() ->
    ?assertEqual(
       <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div>RAW!<!-- comment! --></body></html>">>,
       iolist_to_binary(
         to_html({html, [],
                  [{<<"head">>, [],
                    [{title, <<"hey!">>}]},
                   {body, [],
                    [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
                     {'div', <<"sucka">>},
                     {'=', <<"RAW!">>},
                     {comment, <<" comment! ">>}]}]}))),
    ?assertEqual(
       <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
       iolist_to_binary(
         to_html({doctype,
                  [<<"html">>, <<"PUBLIC">>,
                   <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
                   <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))),
    ?assertEqual(
       <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>,
       iolist_to_binary(
         to_html({<<"html">>,[],
                  [{pi, <<"xml:namespace">>,
                    [{<<"prefix">>,<<"o">>},
                     {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))),
    ok.

escape_test() ->
    ?assertEqual(
       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
       escape(<<"&quot;\"word ><<up!&quot;">>)),
    ?assertEqual(
       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
       escape("&quot;\"word ><<up!&quot;")),
    ?assertEqual(
       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
       escape('&quot;\"word ><<up!&quot;')),
    ok.

escape_attr_test() ->
    ?assertEqual(
       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
       escape_attr(<<"&quot;\"word ><<up!&quot;">>)),
    ?assertEqual(
       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
       escape_attr("&quot;\"word ><<up!&quot;")),
    ?assertEqual(
       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
       escape_attr('&quot;\"word ><<up!&quot;')),
    ?assertEqual(
       <<"12345">>,
       escape_attr(12345)),
    ?assertEqual(
       <<"1.5">>,
       escape_attr(1.5)),
    ok.

tokens_test() ->
    ?assertEqual(
       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                                {<<"wibble">>, <<"wibble">>},
                                {<<"alice">>, <<"bob">>}], true}],
       tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>)),
    ?assertEqual(
       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                                {<<"wibble">>, <<"wibble">>},
                                {<<"alice">>, <<"bob">>}], true}],
       tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>)),
    ?assertEqual(
       [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}],
       tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"textarea">>, [], false},
        {data, <<"<html></body>">>, false},
        {end_tag, <<"textarea">>}],
       tokens(<<"<textarea><html></body></textarea>">>)),
    ?assertEqual(
       [{start_tag, <<"textarea">>, [], false},
        {data, <<"<html></body></textareaz>">>, false}],
       tokens(<<"<textarea ><html></body></textareaz>">>)),
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?>">>)),
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office \n?>">>)),
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office">>)),
    ?assertEqual(
       [{data, <<"<">>, false}],
       tokens(<<"&lt;">>)),
    ?assertEqual(
       [{data, <<"not html ">>, false},
        {data, <<"< at all">>, false}],
       tokens(<<"not html < at all">>)),
    ok.

parse_test() ->
    D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
<html>
 <head>
 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
 <title>Foo</title>
 <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
 <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
 <!--[if lt IE 7]>
 <style type=\"text/css\">
 .no_ie { display: none; }
 </style>
 <![endif]-->
 <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
 <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
 </head>
 <body id=\"home\" class=\"tundra\"><![CDATA[&lt;<this<!-- is -->CDATA>&gt;]]></body>
</html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [],
          [{<<"meta">>,
            [{<<"http-equiv">>,<<"Content-Type">>},
             {<<"content">>,<<"text/html; charset=UTF-8">>}],
            []},
           {<<"title">>,[],[<<"Foo">>]},
           {<<"link">>,
            [{<<"rel">>,<<"stylesheet">>},
             {<<"type">>,<<"text/css">>},
             {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
             {<<"media">>,<<"screen">>}],
            []},
           {<<"link">>,
            [{<<"rel">>,<<"stylesheet">>},
             {<<"type">>,<<"text/css">>},
             {<<"href">>,<<"/static/foo.css">>},
             {<<"media">>,<<"screen">>}],
            []},
           {comment,<<"[if lt IE 7]>\n <style type=\"text/css\">\n .no_ie { display: none; }\n </style>\n <![endif]">>},
           {<<"link">>,
            [{<<"rel">>,<<"icon">>},
             {<<"href">>,<<"/static/images/favicon.ico">>},
             {<<"type">>,<<"image/x-icon">>}],
            []},
           {<<"link">>,
            [{<<"rel">>,<<"shortcut icon">>},
             {<<"href">>,<<"/static/images/favicon.ico">>},
             {<<"type">>,<<"image/x-icon">>}],
            []}]},
         {<<"body">>,
          [{<<"id">>,<<"home">>},
           {<<"class">>,<<"tundra">>}],
          [<<"&lt;<this<!-- is -->CDATA>&gt;">>]}]},
       parse(D0)),
    ?assertEqual(
       {<<"html">>,[],
        [{pi, <<"xml:namespace">>,
          [{<<"prefix">>,<<"o">>},
           {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]},
       parse(
         <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>)),
    ?assertEqual(
       {<<"html">>, [],
        [{<<"dd">>, [], [<<"foo">>]},
         {<<"dt">>, [], [<<"bar">>]}]},
       parse(<<"<html><dd>foo<dt>bar</html>">>)),
    %% Singleton sadness
    ?assertEqual(
       {<<"html">>, [],
        [{<<"link">>, [], []},
         <<"foo">>,
         {<<"br">>, [], []},
         <<"bar">>]},
       parse(<<"<html><link>foo<br>bar</html>">>)),
    ?assertEqual(
       {<<"html">>, [],
        [{<<"link">>, [], [<<"foo">>,
                           {<<"br">>, [], []},
                           <<"bar">>]}]},
       parse(<<"<html><link>foo<br>bar</link></html>">>)),
    %% Case insensitive tags
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [], [<<"foo">>,
                           {<<"br">>, [], []},
                           <<"BAR">>]},
         {<<"body">>, [{<<"class">>, <<"">>}, {<<"bgcolor">>, <<"#Aa01fF">>}], []}
        ]},
       parse(<<"<html><Head>foo<bR>BAR</head><body Class=\"\" bgcolor=\"#Aa01fF\"></BODY></html>">>)),
    ok.

exhaustive_is_singleton_test() ->
    T = mochiweb_cover:clause_lookup_table(?MODULE, is_singleton),
    [?assertEqual(V, is_singleton(K)) || {K, V} <- T].

tokenize_attributes_test() ->
    ?assertEqual(
       {<<"foo">>,
        [{<<"bar">>, <<"b\"az">>},
         {<<"wibble">>, <<"wibble">>},
         {<<"taco", 16#c2, 16#a9>>, <<"bell">>},
         {<<"quux">>, <<"quux">>}],
        []},
       parse(<<"<foo bar=\"b&quot;az\" wibble taco&copy;=bell quux">>)),
    ok.

tokens2_test() ->
    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org</link><description>Bob's Rants</description></channel>">>,
    ?assertEqual(
       [{start_tag,<<"channel">>,[],false},
        {start_tag,<<"title">>,[],false},
        {data,<<"from __future__ import *">>,false},
        {end_tag,<<"title">>},
        {start_tag,<<"link">>,[],true},
        {data,<<"http://bob.pythonmac.org">>,false},
        {end_tag,<<"link">>},
        {start_tag,<<"description">>,[],false},
        {data,<<"Bob's Rants">>,false},
        {end_tag,<<"description">>},
        {end_tag,<<"channel">>}],
       tokens(D0)),
    ok.

to_tokens_test() ->
    ?assertEqual(
       [{start_tag, <<"p">>, [{class, 1}], false},
        {end_tag, <<"p">>}],
       to_tokens({p, [{class, 1}], []})),
    ?assertEqual(
       [{start_tag, <<"p">>, [], false},
        {end_tag, <<"p">>}],
       to_tokens({p})),
    ?assertEqual(
       [{'=', <<"data">>}],
       to_tokens({'=', <<"data">>})),
    ?assertEqual(
       [{comment, <<"comment">>}],
       to_tokens({comment, <<"comment">>})),
    %% This is only allowed in sub-tags:
    %% {p, [{"class", "foo"}]} as {p, [{"class", "foo"}], []}
    %% On the outside it's always treated as follows:
    %% {p, [], [{"class", "foo"}]} as {p, [], [{"class", "foo"}]}
    ?assertEqual(
       [{start_tag, <<"html">>, [], false},
        {start_tag, <<"p">>, [{class, 1}], false},
        {end_tag, <<"p">>},
        {end_tag, <<"html">>}],
       to_tokens({html, [{p, [{class, 1}]}]})),
    ok.

parse2_test() ->
    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org<br>foo</link><description>Bob's Rants</description></channel>">>,
    ?assertEqual(
       {<<"channel">>,[],
        [{<<"title">>,[],[<<"from __future__ import *">>]},
         {<<"link">>,[],
          [<<"http://bob.pythonmac.org">>,
           {<<"br">>,[],[]},
           <<"foo">>]},
         {<<"description">>,[],[<<"Bob's Rants">>]}]},
       parse(D0)),
    ok.

parse_tokens_test() ->
    D0 = [{doctype, [<<"HTML">>, <<"PUBLIC">>, <<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
          {data, <<"\n">>, true},
          {start_tag, <<"html">>, [], false}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D0)),
    D1 = D0 ++ [{end_tag, <<"html">>}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D1)),
    D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
    ?assertEqual(
       {<<"html">>, [], [{<<"body">>, [], []}]},
       parse_tokens(D2)),
    D3 = D0 ++ [{start_tag, <<"head">>, [], false},
                {end_tag, <<"head">>},
                {start_tag, <<"body">>, [], false}],
    ?assertEqual(
       {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]},
       parse_tokens(D3)),
    D4 = D3 ++ [{data, <<"\n">>, true},
                {start_tag, <<"div">>, [{<<"class">>, <<"a">>}], false},
                {start_tag, <<"a">>, [{<<"name">>, <<"#anchor">>}], false},
                {end_tag, <<"a">>},
                {end_tag, <<"div">>},
                {start_tag, <<"div">>, [{<<"class">>, <<"b">>}], false},
                {start_tag, <<"div">>, [{<<"class">>, <<"c">>}], false},
                {end_tag, <<"div">>},
                {end_tag, <<"div">>}],
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [], []},
         {<<"body">>, [],
          [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
           {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
          ]}]},
       parse_tokens(D4)),
    D5 = [{start_tag, <<"html">>, [], false},
          {data, <<"\n">>, true},
          {data, <<"boo">>, false},
          {data, <<"hoo">>, false},
          {data, <<"\n">>, true},
          {end_tag, <<"html">>}],
    ?assertEqual(
       {<<"html">>, [], [<<"\nboohoo\n">>]},
       parse_tokens(D5)),
    D6 = [{start_tag, <<"html">>, [], false},
          {data, <<"\n">>, true},
          {data, <<"\n">>, true},
          {end_tag, <<"html">>}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D6)),
    D7 = [{start_tag, <<"html">>, [], false},
          {start_tag, <<"ul">>, [], false},
          {start_tag, <<"li">>, [], false},
          {data, <<"word">>, false},
          {start_tag, <<"li">>, [], false},
          {data, <<"up">>, false},
          {end_tag, <<"li">>},
          {start_tag, <<"li">>, [], false},
          {data, <<"fdsa">>, false},
          {start_tag, <<"br">>, [], true},
          {data, <<"asdf">>, false},
          {end_tag, <<"ul">>},
          {end_tag, <<"html">>}],
    ?assertEqual(
       {<<"html">>, [],
        [{<<"ul">>, [],
          [{<<"li">>, [], [<<"word">>]},
           {<<"li">>, [], [<<"up">>]},
           {<<"li">>, [], [<<"fdsa">>, {<<"br">>, [], []}, <<"asdf">>]}]}]},
       parse_tokens(D7)),
    ok.

destack_test() ->
    {<<"a">>, [], []} =
        destack([{<<"a">>, [], []}]),
    {<<"a">>, [], [{<<"b">>, [], []}]} =
        destack([{<<"b">>, [], []}, {<<"a">>, [], []}]),
    {<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]} =
        destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}] =
        destack(<<"b">>,
                [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    [{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}] =
        destack(<<"c">>,
                [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    ok.

doctype_test() ->
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
                           "<html><head></head></body></html>")),
    %% http://code.google.com/p/mochiweb/issues/detail?id=52
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<html>"
                           "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
                           "<head></head></body></html>")),
    %% http://github.com/mochi/mochiweb/pull/13
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\"/>"
                           "<html>"
                           "<head></head></body></html>")),
    ok.

dumb_br_test() ->
    %% http://code.google.com/p/mochiweb/issues/detail?id=71
    ?assertEqual(
       {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
       mochiweb_html:parse("<div><br/><br/>z</br/></br/></div>")),
    ?assertEqual(
       {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
       mochiweb_html:parse("<div><br><br>z</br/></br/></div>")),
    ?assertEqual(
       {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>, {<<"br">>, [], []}, {<<"br">>, [], []}]},
       mochiweb_html:parse("<div><br><br>z<br/><br/></div>")),
    ?assertEqual(
       {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
       mochiweb_html:parse("<div><br><br>z</br></br></div>")).

php_test() ->
    %% http://code.google.com/p/mochiweb/issues/detail?id=71
    ?assertEqual(
       [{pi, <<"php\n">>}],
       mochiweb_html:tokens(
         "<?php\n?>")),
    ?assertEqual(
       {<<"div">>, [], [{pi, <<"php\n">>}]},
       mochiweb_html:parse(
         "<div><?php\n?></div>")),
    ok.

parse_unquoted_attr_test() ->
    D0 = <<"<html><img src=/images/icon.png/></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon.png">>}], []}]},
       mochiweb_html:parse(D0)),
    D1 = <<"<html><img src=/images/icon.png></img></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon.png">>}], []}]},
       mochiweb_html:parse(D1)),
    D2 = <<"<html><img src=/images/icon&gt;.png width=100></img></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon>.png">>}, {<<"width">>, <<"100">>}], []}]},
       mochiweb_html:parse(D2)),
    ok.

parse_quoted_attr_test() ->
    D0 = <<"<html><img src='/images/icon.png'></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon.png">>}], []}]},
       mochiweb_html:parse(D0)),
    D1 = <<"<html><img src=\"/images/icon.png'></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon.png'></html>">>}], []}]},
       mochiweb_html:parse(D1)),
    D2 = <<"<html><img src=\"/images/icon&gt;.png\"></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"img">>, [{<<"src">>, <<"/images/icon>.png">>}], []}]},
       mochiweb_html:parse(D2)),
    ok.

parse_missing_attr_name_test() ->
    D0 = <<"<html =black></html>">>,
    ?assertEqual(
       {<<"html">>, [{<<"=">>, <<"=">>}, {<<"black">>, <<"black">>}], []},
       mochiweb_html:parse(D0)),
    ok.

parse_amps_attr_test() ->
    D0 = <<"<a href=\"/hello?test=1&amp;that=2\"></a>">>,
    ?assertEqual(
       {<<"a">>, [{<<"href">>, <<"/hello?test=1&that=2">>}], []},
       mochiweb_html:parse(D0)),
    D1 = <<"<a href=\"/hello?test=1&that=2\"></a>">>,
    ?assertEqual(
       {<<"a">>, [{<<"href">>, <<"/hello?test=1&that=2">>}], []},
       mochiweb_html:parse(D1)),
    D2 = <<"<a href=\"/hello?test=123&that=2&amp;this=too\"></a>">>,
    ?assertEqual(
       {<<"a">>, [{<<"href">>, <<"/hello?test=123&that=2&this=too">>}], []},
       mochiweb_html:parse(D2)),
    D3 = <<"<a href=\"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H\"></a>">>,
    ?assertEqual(
       {<<"a">>, [{<<"href">>, <<"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H">>}], []},
       mochiweb_html:parse(D3)),
    D4 = <<"<a href=\"test?a=1&amp=1008\"></a>">>,
    ?assertEqual(
       {<<"a">>, [{<<"href">>, <<"test?a=1&amp=1008">>}], []},
       mochiweb_html:parse(D4)),
    ok.

parse_broken_pi_test() ->
    D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{pi, <<"xml:namespace">>,
          [{<<"prefix">>, <<"o">>},
           {<<"ns">>, <<"urn:schemas-microsoft-com:office:office">>}]}]},
       mochiweb_html:parse(D0)),
    ok.

parse_funny_singletons_test() ->
    D0 = <<"<html><input><input>x</input></input></html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"input">>, [], []},
         {<<"input">>, [], [<<"x">>]}]},
       mochiweb_html:parse(D0)),
    ok.

parse_charref_test() ->
    %% Normal charref
    D0 = <<"<div>&amp;</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"&">>]},
       mochiweb_html:parse(D0)),
    %% Missing semicolon in the middle.
    D1 = <<"<div>&amp &amp;</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"& &">>]},
       mochiweb_html:parse(D1)),
    %% Missing semicolon on the last entity
    D2 = <<"<div>&amp &amp</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"& &">>]},
       mochiweb_html:parse(D2)),
    D3 = <<"<div>&amp&amp</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"&&">>]},
       mochiweb_html:parse(D3)),
    D4 = <<"<div>&amp</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"&">>]},
       mochiweb_html:parse(D4)),
    ok.

parse_charref_garbage_in_garbage_out_test() ->
    %% A faulty charref is left alone
    D1 = <<"<div>&amp. test</div>">>,
    ?assertEqual(
       {<<"div">>, [], [<<"&amp. test">>]},
       mochiweb_html:parse(D1)),
    ok.

-endif.