/src/mochiweb_html.erl


%% @author Bob Ippolito <bob@mochimedia.com>
%% @copyright 2007 Mochi Media, Inc.
%%
%% Permission is hereby granted, free of charge, to any person obtaining a
%% copy of this software and associated documentation files (the "Software"),
%% to deal in the Software without restriction, including without limitation
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
%% and/or sell copies of the Software, and to permit persons to whom the
%% Software is furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be included in
%% all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
%% DEALINGS IN THE SOFTWARE.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).
-compile([export_all]).

-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).
-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).
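
%% Usage sketch (not part of the original module text; result shape
%% follows the html_node() type above):
%%   1> mochiweb_html:parse(<<"<html><body><p>hi</p></body></html>">>).
%%   {<<"html">>,[],[{<<"body">>,[],[{<<"p">>,[],[<<"hi">>]}]}]}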

%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.

find_document(Tokens=[{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].

maybe_add_html_tag(Tokens=[{start_tag, Tag, _Attrs, false} | _], html5)
  when Tag =/= <<"html">> ->
    [{start_tag, <<"html">>, [], false} | Tokens];
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.

%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).
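
%% Usage sketch (token shapes follow the html_token() types above;
%% tag and attribute names come back lower-cased as binaries):
%%   1> mochiweb_html:tokens(<<"<p class=foo>hi</p>">>).
%%   [{start_tag,<<"p">>,[{<<"class">>,<<"foo">>}],false},
%%    {data,<<"hi">>,false},
%%    {end_tag,<<"p">>}]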

%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.
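
%% Usage sketch: a bare {Tag} tuple becomes a singleton start tag, so
%% to_tokens({br}) gives [{start_tag,<<"br">>,[],true}], while
%% to_tokens({p, [], [<<"hi">>]}) gives the start/data/end triple that
%% tokens/1 would produce for <p>hi</p>.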

%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).
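
%% Usage sketch (the result is an iolist; flatten with iolist_to_binary/1):
%%   1> iolist_to_binary(mochiweb_html:to_html({p, [{"class", "foo"}], [<<"hi">>]})).
%%   <<"<p class=\"foo\">hi</p>">>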

%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).
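
%% Usage sketch:
%%   1> mochiweb_html:escape(<<"1 < 2 & 3">>).
%%   <<"1 &lt; 2 &amp; 3">>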

%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    escape_attr(mochinum:digits(F), []).
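
%% Usage sketch (unlike escape/1, this also escapes double quotes):
%%   1> mochiweb_html:escape_attr("say \"hi\"").
%%   <<"say &quot;hi&quot;">>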

to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    Inside = doctype_to_html(Parts, Acc),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).

doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.

attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).

escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).

escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).

to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).

to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).

tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.

parse_flag({start_tag, B, _, false}) ->
    case string:to_lower(binary_to_list(B)) of
        "script" ->
            script;
        "textarea" ->
            textarea;
        _ ->
            none
    end;
parse_flag(_) ->
    none.

tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
          when ?IS_WHITESPACE(C); not ?IS_LITERAL_SAFE(C) ->
            %% This isn't really strict HTML
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.

tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    tree(Rest, Stack).

norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).

stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
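
%% Behavior sketch: stack/2 auto-closes an open li/option (and dd/dt
%% pairs) when a sibling opens without an explicit end tag, so
%% parse(<<"<ul><li>a<li>b</ul>">>) yields
%% {<<"ul">>,[],[{<<"li">>,[],[<<"a">>]},{<<"li">>,[],[<<"b">>]}]}.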

append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].

destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.

destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).

is_singleton(<<"br">>) -> true;
is_singleton(<<"hr">>) -> true;
is_singleton(<<"img">>) -> true;
is_singleton(<<"input">>) -> true;
is_singleton(<<"base">>) -> true;
is_singleton(<<"meta">>) -> true;
is_singleton(<<"link">>) -> true;
is_singleton(<<"area">>) -> true;
is_singleton(<<"param">>) -> true;
is_singleton(<<"col">>) -> true;
is_singleton(_) -> false.
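
%% Behavior sketch: singleton (void) tags never take children, so
%% parse(<<"<div><br>text</div>">>) places the text after the br node:
%% {<<"div">>,[],[{<<"br">>,[],[]},<<"text">>]}.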

tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.

tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.

tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.

tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            {[], S};
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.

tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data | Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C | Acc], Q)
    end.

tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data | Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C | Acc])
    end.

skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.

tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                         orelse C =:= $/
                                         orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.

tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.

raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.

find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.

find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.

tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        tokenize_charref(Bin, S, O)
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.
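
%% Behavior sketch: a recognized entity decodes to its codepoint, so
%% tokens(<<"&amp;">>) gives [{data,<<"&">>,false}]; an unknown or
%% unterminated reference throws invalid_charref, which the clause above
%% converts to a literal "&" data token before re-tokenizing the rest of
%% the input as ordinary data.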

tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           throw(invalid_charref);
                       Unichar when is_integer(Unichar) ->
                           mochiutf8:codepoint_to_bytes(Unichar);
                       Unichars when is_list(Unichars) ->
                           unicode:characters_to_binary(Unichars)
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.

tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.

tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.

tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.

tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.

tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.

tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
          when (SS =:= $s orelse SS =:= $S) andalso
               (CC =:= $c orelse CC =:= $C) andalso
               (RR =:= $r orelse RR =:= $R) andalso
               (II =:= $i orelse II =:= $I) andalso
               (PP =:= $p orelse PP =:= $P) andalso
               (TT =:= $t orelse TT =:= $T) andalso
               ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
          when (TT =:= $t orelse TT =:= $T) andalso
               (EE =:= $e orelse EE =:= $E) andalso
               (XX =:= $x orelse XX =:= $X) andalso
               (TT2 =:= $t orelse TT2 =:= $T) andalso
               (AA =:= $a orelse AA =:= $A) andalso
               (RR =:= $r orelse RR =:= $R) andalso
               (EE2 =:= $e orelse EE2 =:= $E) andalso
               (AA2 =:= $a orelse AA2 =:= $A) andalso
               ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.