/deps/mochiweb/src/mochiweb_html.erl
- %% @author Bob Ippolito <bob@mochimedia.com>
- %% @copyright 2007 Mochi Media, Inc.
- %% @doc Loosely tokenizes and generates parse trees for HTML 4.
- -module(mochiweb_html).
- -export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
- escape_attr/1, to_html/1]).
- %% This is a macro to placate syntax highlighters.
- -define(QUOTE, $\").
- -define(SQUOTE, $\').
- -define(ADV_COL(S, N),
- S#decoder{column=N+S#decoder.column,
- offset=N+S#decoder.offset}).
- -define(INC_COL(S),
- S#decoder{column=1+S#decoder.column,
- offset=1+S#decoder.offset}).
- -define(INC_LINE(S),
- S#decoder{column=1,
- line=1+S#decoder.line,
- offset=1+S#decoder.offset}).
- -define(INC_CHAR(S, C),
- case C of
- $\n ->
- S#decoder{column=1,
- line=1+S#decoder.line,
- offset=1+S#decoder.offset};
- _ ->
- S#decoder{column=1+S#decoder.column,
- offset=1+S#decoder.offset}
- end).
- -define(IS_WHITESPACE(C),
- (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
- -define(IS_LITERAL_SAFE(C),
- ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
- orelse (C >= $0 andalso C =< $9))).
- -define(PROBABLE_CLOSE(C),
- (C =:= $> orelse ?IS_WHITESPACE(C))).
- -record(decoder, {line=1,
- column=1,
- offset=0}).
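- %% Decoder state: 1-based line and column (for human-readable positions)
- %% and a 0-based byte offset used by the binary match patterns below.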
- %% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
- %% @type html_attr() = {string(), string()}
- %% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
- %% @type html_data() = {data, string(), Whitespace::boolean()}
- %% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
- %% @type end_tag() = {end_tag, Name}
- %% @type html_comment() = {comment, Comment}
- %% @type html_doctype() = {doctype, [Doctype]}
- %% @type inline_html() = {'=', iolist()}
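- %% For example, <p class="x">hi</p> parses to
- %% {<<"p">>, [{<<"class">>, <<"x">>}], [<<"hi">>]}.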
- %% External API.
- %% @spec parse(string() | binary()) -> html_node()
- %% @doc tokenize and then transform the token stream into an HTML tree.
- parse(Input) ->
- parse_tokens(tokens(Input)).
- %% @spec parse_tokens([html_token()]) -> html_node()
- %% @doc Transform the output of tokens(Doc) into an HTML tree.
- parse_tokens(Tokens) when is_list(Tokens) ->
- %% Skip over doctype, processing instructions
- F = fun (X) ->
- case X of
- {start_tag, _, _, false} ->
- false;
- _ ->
- true
- end
- end,
- [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens),
- {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
- Tree.
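- %% Illustrative shell session (note the implicitly closed <li> tags):
- %% 1> mochiweb_html:parse(<<"<ul><li>a<li>b</ul>">>).
- %% {<<"ul">>,[],[{<<"li">>,[],[<<"a">>]},{<<"li">>,[],[<<"b">>]}]}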
- %% @spec tokens(StringOrBinary) -> [html_token()]
- %% @doc Transform the input UTF-8 HTML into a token stream.
- tokens(Input) ->
- tokens(iolist_to_binary(Input), #decoder{}, []).
- %% @spec to_tokens(html_node()) -> [html_token()]
- %% @doc Convert a html_node() tree to a list of tokens.
- to_tokens({Tag0}) ->
- to_tokens({Tag0, [], []});
- to_tokens(T={'=', _}) ->
- [T];
- to_tokens(T={doctype, _}) ->
- [T];
- to_tokens(T={comment, _}) ->
- [T];
- to_tokens({Tag0, Acc}) ->
- %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
- to_tokens({Tag0, [], Acc});
- to_tokens({Tag0, Attrs, Acc}) ->
- Tag = to_tag(Tag0),
- to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]).
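- %% For example, to_tokens({p, [{class, foo}], [<<"hi">>]}) yields
- %% [{start_tag, <<"p">>, [{class, foo}], false},
- %% {data, <<"hi">>, false},
- %% {end_tag, <<"p">>}].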
- %% @spec to_html([html_token()] | html_node()) -> iolist()
- %% @doc Convert a list of html_token() to an HTML document.
- to_html(Node) when is_tuple(Node) ->
- to_html(to_tokens(Node));
- to_html(Tokens) when is_list(Tokens) ->
- to_html(Tokens, []).
- %% @spec escape(string() | atom() | binary()) -> binary()
- %% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
- escape(B) when is_binary(B) ->
- escape(binary_to_list(B), []);
- escape(A) when is_atom(A) ->
- escape(atom_to_list(A), []);
- escape(S) when is_list(S) ->
- escape(S, []).
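- %% e.g. escape(<<"1 < 2 & 3">>) yields <<"1 &lt; 2 &amp; 3">>.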
- %% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
- %% @doc Escape a string such that it's safe for HTML attrs
- %% (amp; lt; gt; quot;).
- escape_attr(B) when is_binary(B) ->
- escape_attr(binary_to_list(B), []);
- escape_attr(A) when is_atom(A) ->
- escape_attr(atom_to_list(A), []);
- escape_attr(S) when is_list(S) ->
- escape_attr(S, []);
- escape_attr(I) when is_integer(I) ->
- escape_attr(integer_to_list(I), []);
- escape_attr(F) when is_float(F) ->
- escape_attr(mochinum:digits(F), []).
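- %% e.g. escape_attr(<<"say \"hi\"">>) yields <<"say &quot;hi&quot;">>.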
- to_html(Tree, Acc) ->
- to_html(Tree, Acc, true).
- to_html([], Acc, _Escape) ->
- lists:reverse(Acc);
- to_html([{'=', Content} | Rest], Acc, Escape) ->
- to_html(Rest, [Content | Acc], Escape);
- to_html([{pi, Bin} | Rest], Acc, Escape) ->
- Open = [<<"<?">>,
- Bin,
- <<"?>">>],
- to_html(Rest, [Open | Acc], Escape);
- to_html([{pi, Tag, Attrs} | Rest], Acc, Escape) ->
- Open = [<<"<?">>,
- Tag,
- attrs_to_html(Attrs, []),
- <<"?>">>],
- to_html(Rest, [Open | Acc], Escape);
- to_html([{comment, Comment} | Rest], Acc, Escape) ->
- to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc], Escape);
- to_html([{doctype, Parts} | Rest], Acc, Escape) ->
- Inside = doctype_to_html(Parts, []),
- to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc], Escape);
- to_html([{data, Data, _Whitespace} | Rest], Acc, true) ->
- to_html(Rest, [escape(Data) | Acc], true);
- to_html([{data, Data, _Whitespace} | Rest], Acc, false) ->
- to_html(Rest, [Data | Acc], false);
- to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc, _Escape) ->
- EscapeData = case Tag of
- <<"script">> -> false;
- _ -> true
- end,
- Open = [<<"<">>,
- Tag,
- attrs_to_html(Attrs, []),
- case Singleton of
- true -> <<" />">>;
- false -> <<">">>
- end],
- to_html(Rest, [Open | Acc], EscapeData);
- to_html([{end_tag, Tag} | Rest], Acc, _Escape) ->
- to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc], false).
- doctype_to_html([], Acc) ->
- lists:reverse(Acc);
- doctype_to_html([Word | Rest], Acc) ->
- case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
- binary_to_list(iolist_to_binary(Word))) of
- true ->
- doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
- false ->
- doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
- end.
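- %% Doctype words made only of [A-Za-z0-9] are emitted bare; anything else
- %% is double-quoted and attribute-escaped.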
- attrs_to_html([], Acc) ->
- lists:reverse(Acc);
- attrs_to_html([{K, V} | Rest], Acc) ->
- attrs_to_html(Rest,
- [[<<" ">>, escape(K), <<"=\"">>,
- escape_attr(V), <<"\"">>] | Acc]).
- escape([], Acc) ->
- list_to_binary(lists:reverse(Acc));
- escape("<" ++ Rest, Acc) ->
- escape(Rest, lists:reverse("<", Acc));
- escape(">" ++ Rest, Acc) ->
- escape(Rest, lists:reverse(">", Acc));
- escape("&" ++ Rest, Acc) ->
- escape(Rest, lists:reverse("&", Acc));
- escape([C | Rest], Acc) ->
- escape(Rest, [C | Acc]).
- escape_attr([], Acc) ->
- list_to_binary(lists:reverse(Acc));
- escape_attr("<" ++ Rest, Acc) ->
- escape_attr(Rest, lists:reverse("<", Acc));
- escape_attr(">" ++ Rest, Acc) ->
- escape_attr(Rest, lists:reverse(">", Acc));
- escape_attr("&" ++ Rest, Acc) ->
- escape_attr(Rest, lists:reverse("&", Acc));
- escape_attr([?QUOTE | Rest], Acc) ->
- escape_attr(Rest, lists:reverse(""", Acc));
- escape_attr([C | Rest], Acc) ->
- escape_attr(Rest, [C | Acc]).
- to_tag(A) when is_atom(A) ->
- norm(atom_to_list(A));
- to_tag(L) ->
- norm(L).
- to_tokens([], Acc) ->
- lists:reverse(Acc);
- to_tokens([{Tag, []} | Rest], Acc) ->
- to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
- to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
- %% Allow {br}
- to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
- to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
- %% Allow {'=', iolist()}
- to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
- to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
- %% Allow {comment, iolist()}
- to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
- to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
- %% Allow {pi, binary()}
- to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
- to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
- %% Allow {pi, binary(), list()}
- to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
- to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
- %% Allow {p, [{"class", "foo"}]}
- to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
- to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
- %% Allow {p, "content"} and {p, <<"content">>}
- to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
- to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
- %% Allow {"p", [{"class", "foo"}], <<"content">>}
- to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
- to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
- when is_integer(C) ->
- %% Allow {"p", [{"class", "foo"}], "content"}
- to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
- to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
- %% Native {"p", [{"class", "foo"}], ["content"]}
- Tag = to_tag(Tag0),
- T1 = to_tag(T0),
- case is_singleton(norm(T1)) of
- true ->
- to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
- false ->
- to_tokens([{T1, C1}, {Tag, R1} | Rest],
- [{start_tag, T1, A1, false} | Acc])
- end;
- to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
- %% List text
- Tag = to_tag(Tag0),
- to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
- to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
- %% Binary text
- Tag = to_tag(Tag0),
- to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
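- %% Together these clauses normalize the tolerant tuple shorthands
- %% ({br}, {p, "text"}, {p, Attrs}, ...) into the canonical
- %% {Tag, Attrs, Children} form before emitting start/end/data tokens.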
- tokens(B, S=#decoder{offset=O}, Acc) ->
- case B of
- <<_:O/binary>> ->
- lists:reverse(Acc);
- _ ->
- {Tag, S1} = tokenize(B, S),
- case parse_flag(Tag) of
- script ->
- {Tag2, S2} = tokenize_script(B, S1),
- tokens(B, S2, [Tag2, Tag | Acc]);
- textarea ->
- {Tag2, S2} = tokenize_textarea(B, S1),
- tokens(B, S2, [Tag2, Tag | Acc]);
- none ->
- tokens(B, S1, [Tag | Acc])
- end
- end.
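- %% The tokenizer never splits the input; it advances the byte offset in
- %% #decoder{} and matches with <<_:O/binary, Pattern, _/binary>>, slicing
- %% sub-binaries out only when a complete token is found.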
- parse_flag({start_tag, B, _, false}) ->
- case string:to_lower(binary_to_list(B)) of
- "script" ->
- script;
- "textarea" ->
- textarea;
- _ ->
- none
- end;
- parse_flag(_) ->
- none.
- tokenize(B, S=#decoder{offset=O}) ->
- case B of
- <<_:O/binary, "<!--", _/binary>> ->
- tokenize_comment(B, ?ADV_COL(S, 4));
- <<_:O/binary, "<!DOCTYPE", _/binary>> ->
- tokenize_doctype(B, ?ADV_COL(S, 10));
- <<_:O/binary, "<![CDATA[", _/binary>> ->
- tokenize_cdata(B, ?ADV_COL(S, 9));
- <<_:O/binary, "<?php", _/binary>> ->
- {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
- {{pi, Body}, S1};
- <<_:O/binary, "<?", _/binary>> ->
- {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
- {Attrs, S2} = tokenize_attributes(B, S1),
- S3 = find_qgt(B, S2),
- {{pi, Tag, Attrs}, S3};
- <<_:O/binary, "&", _/binary>> ->
- tokenize_charref(B, ?INC_COL(S));
- <<_:O/binary, "</", _/binary>> ->
- {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
- {S2, _} = find_gt(B, S1),
- {{end_tag, Tag}, S2};
- <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
- %% This isn't really strict HTML
- {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
- {{data, <<$<, Data/binary>>, false}, S1};
- <<_:O/binary, "<", _/binary>> ->
- {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
- {Attrs, S2} = tokenize_attributes(B, S1),
- {S3, HasSlash} = find_gt(B, S2),
- Singleton = HasSlash orelse is_singleton(Tag),
- {{start_tag, Tag, Attrs, Singleton}, S3};
- _ ->
- tokenize_data(B, S)
- end.
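- %% Clause order matters above: "<!--", "<!DOCTYPE", "<![CDATA[" and
- %% "<?php" are tried before the generic "<?", "</" and "<" prefixes.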
- tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
- tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
- tree_data(Rest, AllWhitespace, Acc) ->
- {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.
- tree([], Stack) ->
- {destack(Stack), []};
- tree([{end_tag, Tag} | Rest], Stack) ->
- case destack(norm(Tag), Stack) of
- S when is_list(S) ->
- tree(Rest, S);
- Result ->
- {Result, []}
- end;
- tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
- tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
- tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
- tree(Rest, stack(norm({Tag, Attrs}), S));
- tree([T={pi, _Raw} | Rest], S) ->
- tree(Rest, append_stack_child(T, S));
- tree([T={pi, _Tag, _Attrs} | Rest], S) ->
- tree(Rest, append_stack_child(T, S));
- tree([T={comment, _Comment} | Rest], S) ->
- tree(Rest, append_stack_child(T, S));
- tree(L=[{data, _Data, _Whitespace} | _], S) ->
- case tree_data(L, true, []) of
- {_, true, Rest} ->
- tree(Rest, S);
- {Data, false, Rest} ->
- tree(Rest, append_stack_child(Data, S))
- end;
- tree([{doctype, _} | Rest], Stack) ->
- tree(Rest, Stack).
- norm({Tag, Attrs}) ->
- {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
- norm(Tag) when is_binary(Tag) ->
- Tag;
- norm(Tag) ->
- list_to_binary(string:to_lower(Tag)).
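- %% stack/2 pushes a new open element, but first implicitly closes a
- %% preceding <li>/<option> sibling or an open <dd>/<dt>, since HTML
- %% allows those end tags to be omitted.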
- stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
- when TN =:= <<"li">> orelse TN =:= <<"option">> ->
- [T1 | destack(TN, Stack)];
- stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
- when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
- (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
- [T1 | destack(TN1, Stack)];
- stack(T1, Stack) ->
- [T1 | Stack].
- append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
- [{Name, Attrs, [StartTag | Acc]} | Stack].
- destack(<<"br">>, Stack) ->
- %% This is an ugly hack to make dumb_br_test() pass; it ensures that
- %% br can never have children.
- Stack;
- destack(TagName, Stack) when is_list(Stack) ->
- F = fun (X) ->
- case X of
- {TagName, _, _} ->
- false;
- _ ->
- true
- end
- end,
- case lists:splitwith(F, Stack) of
- {_, []} ->
- %% If we're parsing something like XML we might find
- %% a <link>tag</link> that is normally a singleton
- %% in HTML but isn't here
- case {is_singleton(TagName), Stack} of
- {true, [{T0, A0, Acc0} | Post0]} ->
- case lists:splitwith(F, Acc0) of
- {_, []} ->
- %% Actually was a singleton
- Stack;
- {Pre, [{T1, A1, Acc1} | Post1]} ->
- [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
- | Post0]
- end;
- _ ->
- %% No match, no state change
- Stack
- end;
- {_Pre, [_T]} ->
- %% Unfurl the whole stack, we're done
- destack(Stack);
- {Pre, [T, {T0, A0, Acc0} | Post]} ->
- %% Unfurl up to the tag, then accumulate it
- [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
- end.
- destack([{Tag, Attrs, Acc}]) ->
- {Tag, Attrs, lists:reverse(Acc)};
- destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
- destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
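- %% destack/1 folds the remaining open elements into a single root node,
- %% reversing each accumulated child list back into document order.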
- is_singleton(<<"br">>) -> true;
- is_singleton(<<"hr">>) -> true;
- is_singleton(<<"img">>) -> true;
- is_singleton(<<"input">>) -> true;
- is_singleton(<<"base">>) -> true;
- is_singleton(<<"meta">>) -> true;
- is_singleton(<<"link">>) -> true;
- is_singleton(<<"area">>) -> true;
- is_singleton(<<"param">>) -> true;
- is_singleton(<<"col">>) -> true;
- is_singleton(_) -> false.
- tokenize_data(B, S=#decoder{offset=O}) ->
- tokenize_data(B, S, O, true).
- tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
- case B of
- <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
- tokenize_data(B, ?INC_CHAR(S, C), Start,
- (Whitespace andalso ?IS_WHITESPACE(C)));
- _ ->
- Len = O - Start,
- <<_:Start/binary, Data:Len/binary, _/binary>> = B,
- {{data, Data, Whitespace}, S}
- end.
- tokenize_attributes(B, S) ->
- tokenize_attributes(B, S, []).
- tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
- case B of
- <<_:O/binary>> ->
- {lists:reverse(Acc), S};
- <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
- {lists:reverse(Acc), S};
- <<_:O/binary, "?>", _/binary>> ->
- {lists:reverse(Acc), S};
- <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
- tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
- _ ->
- {Attr, S1} = tokenize_literal(B, S),
- {Value, S2} = tokenize_attr_value(Attr, B, S1),
- tokenize_attributes(B, S2, [{Attr, Value} | Acc])
- end.
- tokenize_attr_value(Attr, B, S) ->
- S1 = skip_whitespace(B, S),
- O = S1#decoder.offset,
- case B of
- <<_:O/binary, "=", _/binary>> ->
- S2 = skip_whitespace(B, ?INC_COL(S1)),
- tokenize_quoted_or_unquoted_attr_value(B, S2);
- _ ->
- {Attr, S1}
- end.
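- %% A valueless attribute (e.g. the wibble in <foo wibble>) takes the
- %% second branch and uses its own name as its value.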
-
- tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
- case B of
- <<_:O/binary>> ->
- { [], S };
- <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
- Q =:= ?SQUOTE ->
- tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
- <<_:O/binary, _/binary>> ->
- tokenize_unquoted_attr_value(B, S, [])
- end.
-
- tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
- case B of
- <<_:O/binary>> ->
- { iolist_to_binary(lists:reverse(Acc)), S };
- <<_:O/binary, $&, _/binary>> ->
- {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
- tokenize_quoted_attr_value(B, S1, [Data|Acc], Q);
- <<_:O/binary, Q, _/binary>> ->
- { iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S) };
- <<_:O/binary, $\n, _/binary>> ->
- { iolist_to_binary(lists:reverse(Acc)), ?INC_LINE(S) };
- <<_:O/binary, C, _/binary>> ->
- tokenize_quoted_attr_value(B, ?INC_COL(S), [C|Acc], Q)
- end.
-
- tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
- case B of
- <<_:O/binary>> ->
- { iolist_to_binary(lists:reverse(Acc)), S };
- <<_:O/binary, $&, _/binary>> ->
- {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
- tokenize_unquoted_attr_value(B, S1, [Data|Acc]);
- <<_:O/binary, $/, $>, _/binary>> ->
- { iolist_to_binary(lists:reverse(Acc)), S };
- <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
- { iolist_to_binary(lists:reverse(Acc)), S };
- <<_:O/binary, C, _/binary>> ->
- tokenize_unquoted_attr_value(B, ?INC_COL(S), [C|Acc])
- end.
- skip_whitespace(B, S=#decoder{offset=O}) ->
- case B of
- <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
- skip_whitespace(B, ?INC_CHAR(S, C));
- _ ->
- S
- end.
- tokenize_literal(Bin, S=#decoder{offset=O}) ->
- case Bin of
- <<_:O/binary, C, _/binary>> when C =:= $>
- orelse C =:= $/
- orelse C =:= $= ->
- %% Handle case where tokenize_literal would consume
- %% 0 chars. http://github.com/mochi/mochiweb/pull/13
- {[C], ?INC_COL(S)};
- _ ->
- tokenize_literal(Bin, S, [])
- end.
- tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
- case Bin of
- <<_:O/binary, $&, _/binary>> ->
- {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
- tokenize_literal(Bin, S1, [Data | Acc]);
- <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
- orelse C =:= $>
- orelse C =:= $/
- orelse C =:= $=) ->
- tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
- _ ->
- {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
- end.
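- %% Tag and attribute names are lowercased here, which is why norm/1 and
- %% the parse trees deal exclusively in lowercase binaries.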
- raw_qgt(Bin, S=#decoder{offset=O}) ->
- raw_qgt(Bin, S, O).
- raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- <<_:O/binary, "?>", _/binary>> ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {Raw, ?ADV_COL(S, 2)};
- <<_:O/binary, C, _/binary>> ->
- raw_qgt(Bin, ?INC_CHAR(S, C), Start);
- <<_:O/binary>> ->
- <<_:Start/binary, Raw/binary>> = Bin,
- {Raw, S}
- end.
- find_qgt(Bin, S=#decoder{offset=O}) ->
- case Bin of
- <<_:O/binary, "?>", _/binary>> ->
- ?ADV_COL(S, 2);
- <<_:O/binary, ">", _/binary>> ->
- ?ADV_COL(S, 1);
- <<_:O/binary, "/>", _/binary>> ->
- ?ADV_COL(S, 2);
- %% tokenize_attributes takes care of this state:
- %% <<_:O/binary, C, _/binary>> ->
- %% find_qgt(Bin, ?INC_CHAR(S, C));
- <<_:O/binary>> ->
- S
- end.
- find_gt(Bin, S) ->
- find_gt(Bin, S, false).
- find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
- case Bin of
- <<_:O/binary, $/, _/binary>> ->
- find_gt(Bin, ?INC_COL(S), true);
- <<_:O/binary, $>, _/binary>> ->
- {?INC_COL(S), HasSlash};
- <<_:O/binary, C, _/binary>> ->
- find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
- _ ->
- {S, HasSlash}
- end.
- tokenize_charref(Bin, S=#decoder{offset=O}) ->
- tokenize_charref(Bin, S, O).
- tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- <<_:O/binary>> ->
- <<_:Start/binary, Raw/binary>> = Bin,
- {{data, Raw, false}, S};
- <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
- orelse C =:= ?SQUOTE
- orelse C =:= ?QUOTE
- orelse C =:= $/
- orelse C =:= $<
- orelse C =:= $>
- orelse C =:= $& ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- Data = case mochiweb_charref:charref(Raw) of
- undefined ->
- Start1 = Start - 1,
- Len1 = Len + 1,
- <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
- R;
- Unichar ->
- mochiutf8:codepoint_to_bytes(Unichar)
- end,
- {{data, Data, false}, S};
- <<_:O/binary, $;, _/binary>> ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- Data = case mochiweb_charref:charref(Raw) of
- undefined ->
- Start1 = Start - 1,
- Len1 = Len + 2,
- <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
- R;
- Unichar ->
- mochiutf8:codepoint_to_bytes(Unichar)
- end,
- {{data, Data, false}, ?INC_COL(S)};
- _ ->
- tokenize_charref(Bin, ?INC_COL(S), Start)
- end.
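- %% Character references that mochiweb_charref does not recognize are
- %% passed through verbatim, including the leading "&" and any trailing
- %% ";" (see parse_charref_garbage_in_garbage_out_test/0).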
- tokenize_doctype(Bin, S) ->
- tokenize_doctype(Bin, S, []).
- tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
- case Bin of
- <<_:O/binary>> ->
- {{doctype, lists:reverse(Acc)}, S};
- <<_:O/binary, $>, _/binary>> ->
- {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
- <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
- tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
- _ ->
- {Word, S1} = tokenize_word_or_literal(Bin, S),
- tokenize_doctype(Bin, S1, [Word | Acc])
- end.
- tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
- case Bin of
- <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
- tokenize_word(Bin, ?INC_COL(S), C);
- <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
- %% Sanity check for whitespace
- tokenize_literal(Bin, S)
- end.
- tokenize_word(Bin, S, Quote) ->
- tokenize_word(Bin, S, Quote, []).
- tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
- case Bin of
- <<_:O/binary>> ->
- {iolist_to_binary(lists:reverse(Acc)), S};
- <<_:O/binary, Quote, _/binary>> ->
- {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
- <<_:O/binary, $&, _/binary>> ->
- {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
- tokenize_word(Bin, S1, Quote, [Data | Acc]);
- <<_:O/binary, C, _/binary>> ->
- tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
- end.
- tokenize_cdata(Bin, S=#decoder{offset=O}) ->
- tokenize_cdata(Bin, S, O).
- tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- <<_:O/binary, "]]>", _/binary>> ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {{data, Raw, false}, ?ADV_COL(S, 3)};
- <<_:O/binary, C, _/binary>> ->
- tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
- _ ->
- <<_:O/binary, Raw/binary>> = Bin,
- {{data, Raw, false}, S}
- end.
- tokenize_comment(Bin, S=#decoder{offset=O}) ->
- tokenize_comment(Bin, S, O).
- tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- <<_:O/binary, "-->", _/binary>> ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {{comment, Raw}, ?ADV_COL(S, 3)};
- <<_:O/binary, C, _/binary>> ->
- tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
- <<_:Start/binary, Raw/binary>> ->
- {{comment, Raw}, S}
- end.
- tokenize_script(Bin, S=#decoder{offset=O}) ->
- tokenize_script(Bin, S, O).
- tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- %% Just a look-ahead, we want the end_tag separately
- <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
- when (SS =:= $s orelse SS =:= $S) andalso
- (CC =:= $c orelse CC =:= $C) andalso
- (RR =:= $r orelse RR =:= $R) andalso
- (II =:= $i orelse II =:= $I) andalso
- (PP =:= $p orelse PP =:= $P) andalso
- (TT=:= $t orelse TT =:= $T) andalso
- ?PROBABLE_CLOSE(ZZ) ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {{data, Raw, false}, S};
- <<_:O/binary, C, _/binary>> ->
- tokenize_script(Bin, ?INC_CHAR(S, C), Start);
- <<_:Start/binary, Raw/binary>> ->
- {{data, Raw, false}, S}
- end.
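- %% The ?PROBABLE_CLOSE guard on the trailing character keeps look-alikes
- %% such as "</scriptz>" from ending the raw section early.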
- tokenize_textarea(Bin, S=#decoder{offset=O}) ->
- tokenize_textarea(Bin, S, O).
- tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- %% Just a look-ahead, we want the end_tag separately
- <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
- when (TT =:= $t orelse TT =:= $T) andalso
- (EE =:= $e orelse EE =:= $E) andalso
- (XX =:= $x orelse XX =:= $X) andalso
- (TT2 =:= $t orelse TT2 =:= $T) andalso
- (AA =:= $a orelse AA =:= $A) andalso
- (RR =:= $r orelse RR =:= $R) andalso
- (EE2 =:= $e orelse EE2 =:= $E) andalso
- (AA2 =:= $a orelse AA2 =:= $A) andalso
- ?PROBABLE_CLOSE(ZZ) ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {{data, Raw, false}, S};
- <<_:O/binary, C, _/binary>> ->
- tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
- <<_:Start/binary, Raw/binary>> ->
- {{data, Raw, false}, S}
- end.
- %%
- %% Tests
- %%
- -ifdef(TEST).
- -include_lib("eunit/include/eunit.hrl").
- to_html_test() ->
- ?assertEqual(
- <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div>RAW!<!-- comment! --></body></html>">>,
- iolist_to_binary(
- to_html({html, [],
- [{<<"head">>, [],
- [{title, <<"hey!">>}]},
- {body, [],
- [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
- {'div', <<"sucka">>},
- {'=', <<"RAW!">>},
- {comment, <<" comment! ">>}]}]}))),
- ?assertEqual(
- <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
- iolist_to_binary(
- to_html({doctype,
- [<<"html">>, <<"PUBLIC">>,
- <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
- <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))),
- ?assertEqual(
- <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>,
- iolist_to_binary(
- to_html({<<"html">>,[],
- [{pi, <<"xml:namespace">>,
- [{<<"prefix">>,<<"o">>},
- {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))),
- ok.
- escape_test() ->
- ?assertEqual(
- <<"&quot;\"word ><<up!&quot;">>,
- escape(<<""\"word ><<up!"">>)),
- ?assertEqual(
- <<"&quot;\"word ><<up!&quot;">>,
- escape(""\"word ><<up!"")),
- ?assertEqual(
- <<"&quot;\"word ><<up!&quot;">>,
- escape('"\"word ><<up!"')),
- ok.
- escape_attr_test() ->
- ?assertEqual(
- <<"&quot;"word ><<up!&quot;">>,
- escape_attr(<<""\"word ><<up!"">>)),
- ?assertEqual(
- <<"&quot;"word ><<up!&quot;">>,
- escape_attr(""\"word ><<up!"")),
- ?assertEqual(
- <<"&quot;"word ><<up!&quot;">>,
- escape_attr('"\"word ><<up!"')),
- ?assertEqual(
- <<"12345">>,
- escape_attr(12345)),
- ?assertEqual(
- <<"1.5">>,
- escape_attr(1.5)),
- ok.
- tokens_test() ->
- ?assertEqual(
- [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
- {<<"wibble">>, <<"wibble">>},
- {<<"alice">>, <<"bob">>}], true}],
- tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>)),
- ?assertEqual(
- [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
- {<<"wibble">>, <<"wibble">>},
- {<<"alice">>, <<"bob">>}], true}],
- tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>)),
- ?assertEqual(
- [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}],
- tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>)),
- ?assertEqual(
- [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
- {data, <<" A= B <= C ">>, false},
- {end_tag, <<"script">>}],
- tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>)),
- ?assertEqual(
- [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
- {data, <<" A= B <= C ">>, false},
- {end_tag, <<"script">>}],
- tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>)),
- ?assertEqual(
- [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
- {data, <<" A= B <= C ">>, false},
- {end_tag, <<"script">>}],
- tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>)),
- ?assertEqual(
- [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
- {data, <<" A= B <= C ">>, false},
- {end_tag, <<"script">>}],
- tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>)),
- ?assertEqual(
- [{start_tag, <<"textarea">>, [], false},
- {data, <<"<html></body>">>, false},
- {end_tag, <<"textarea">>}],
- tokens(<<"<textarea><html></body></textarea>">>)),
- ?assertEqual(
- [{start_tag, <<"textarea">>, [], false},
- {data, <<"<html></body></textareaz>">>, false}],
- tokens(<<"<textarea ><html></body></textareaz>">>)),
- ?assertEqual(
- [{pi, <<"xml:namespace">>,
- [{<<"prefix">>,<<"o">>},
- {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
- tokens(<<"<?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?>">>)),
- ?assertEqual(
- [{pi, <<"xml:namespace">>,
- [{<<"prefix">>,<<"o">>},
- {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
- tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office \n?>">>)),
- ?assertEqual(
- [{pi, <<"xml:namespace">>,
- [{<<"prefix">>,<<"o">>},
- {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
- tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office">>)),
- ?assertEqual(
- [{data, <<"<">>, false}],
- tokens(<<"<">>)),
- ?assertEqual(
- [{data, <<"not html ">>, false},
- {data, <<"< at all">>, false}],
- tokens(<<"not html < at all">>)),
- ok.
- parse_test() ->
- D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
- <html>
- <head>
- <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
- <title>Foo</title>
- <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
- <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
- <!--[if lt IE 7]>
- <style type=\"text/css\">
- .no_ie { display: none; }
- </style>
- <![endif]-->
- <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
- <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
- </head>
- <body id=\"home\" class=\"tundra\"><![CDATA[<<this<!-- is -->CDATA>>]]></body>
- </html>">>,
- ?assertEqual(
- {<<"html">>, [],
- [{<<"head">>, [],
- [{<<"meta">>,
- [{<<"http-equiv">>,<<"Content-Type">>},
- {<<"content">>,<<"text/html; charset=UTF-8">>}],
- []},
- {<<"title">>,[],[<<"Foo">>]},
- {<<"link">>,
- [{<<"rel">>,<<"stylesheet">>},
- {<<"type">>,<<"text/css">>},
- {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
- {<<"media">>,<<"screen">>}],
- []},
- {<<"link">>,
- [{<<"rel">>,<<"stylesheet">>},
- {<<"type">>,<<"text/css">>},
- {<<"href">>,<<"/static/foo.css">>},
- {<<"media">>,<<"screen">>}],
- []},
- {comment,<<"[if lt IE 7]>\n <style type=\"text/css\">\n .no_ie { display: none; }\n </style>\n <![endif]">>},
- {<<"link">>,
- [{<<"rel">>,<<"icon">>},
- {<<"href">>,<<"/static/images/favicon.ico">>},
- {<<"type">>,<<"image/x-icon">>}],
- []},
- {<<"link">>,
- [{<<"rel">>,<<"shortcut icon">>},
- {<<"href">>,<<"/static/images/favicon.ico">>},
- {<<"type">>,<<"image/x-icon">>}],
- []}]},
- {<<"body">>,
- [{<<"id">>,<<"home">>},
- {<<"class">>,<<"tundra">>}],
- [<<"<<this<!-- is -->CDATA>>">>]}]},
- parse(D0)),
- ?assertEqual(
- {<<"html">>,[],
- [{pi, <<"xml:namespace">>,
- [{<<"prefix">>,<<"o">>},
- {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]},
- parse(
- <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>)),
- ?assertEqual(
- {<<"html">>, [],
- [{<<"dd">>, [], [<<"foo">>]},
- {<<"dt">>, [], [<<"bar">>]}]},
- parse(<<"<html><dd>foo<dt>bar</html>">>)),
- %% Singleton sadness
- ?assertEqual(
- {<<"html">>, [],
- [{<<"link">>, [], []},
- <<"foo">>,
- {<<"br">>, [], []},
- <<"bar">>]},
- parse(<<"<html><link>foo<br>bar</html>">>)),
- ?assertEqual(
- {<<"html">>, [],
- [{<<"link">>, [], [<<"foo">>,
- {<<"br">>, [], []},
- <<"bar">>]}]},
- parse(<<"<html><link>foo<br>bar</link></html>">>)),
- %% Case insensitive tags
- ?assertEqual(
- {<<"html">>, [],
- [{<<"head">>, [], [<<"foo">>,
- {<<"br">>, [], []},
- <<"BAR">>]},
- {<<"body">>, [{<<"class">>, <<"">>}, {<<"bgcolor">>, <<"#Aa01fF">>}], []}
- ]},
- parse(<<"<html><Head>foo<bR>BAR</head><body Class=\"\" bgcolor=\"#Aa01fF\"></BODY></html>">>)),
- ok.
- exhaustive_is_singleton_test() ->
- T = mochiweb_cover:clause_lookup_table(?MODULE, is_singleton),
- [?assertEqual(V, is_singleton(K)) || {K, V} <- T].
- tokenize_attributes_test() ->
- ?assertEqual(
- {<<"foo">>,
- [{<<"bar">>, <<"b\"az">>},
- {<<"wibble">>, <<"wibble">>},
- {<<"taco", 16#c2, 16#a9>>, <<"bell">>},
- {<<"quux">>, <<"quux">>}],
- []},
- parse(<<"<foo bar=\"b"az\" wibble taco©=bell quux">>)),
- ok.
- tokens2_test() ->
- D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org</link><description>Bob's Rants</description></channel>">>,
- ?assertEqual(
- [{start_tag,<<"channel">>,[],false},
- {start_tag,<<"title">>,[],false},
- {data,<<"from __future__ import *">>,false},
- {end_tag,<<"title">>},
- {start_tag,<<"link">>,[],true},
- {data,<<"http://bob.pythonmac.org">>,false},
- {end_tag,<<"link">>},
- {start_tag,<<"description">>,[],false},
- {data,<<"Bob's Rants">>,false},
- {end_tag,<<"description">>},
- {end_tag,<<"channel">>}],
- tokens(D0)),
- ok.
- to_tokens_test() ->
- ?assertEqual(
- [{start_tag, <<"p">>, [{class, 1}], false},
- {end_tag, <<"p">>}],
- to_tokens({p, [{class, 1}], []})),
- ?assertEqual(
- [{start_tag, <<"p">>, [], false},
- {end_tag, <<"p">>}],
- to_tokens({p})),
- ?assertEqual(
- [{'=', <<"data">>}],
- to_tokens({'=', <<"data">>})),
- ?assertEqual(
- [{comment, <<"comment">>}],
- to_tokens({comment, <<"comment">>})),
- %% This is only allowed in sub-tags:
- %% {p, [{"class", "foo"}]} as {p, [{"class", "foo"}], []}
- %% On the outside it's always treated as follows:
- %% {p, [], [{"class", "foo"}]} as {p, [], [{"class", "foo"}]}
- ?assertEqual(
- [{start_tag, <<"html">>, [], false},
- {start_tag, <<"p">>, [{class, 1}], false},
- {end_tag, <<"p">>},
- {end_tag, <<"html">>}],
- to_tokens({html, [{p, [{class, 1}]}]})),
- ok.
- parse2_test() ->
- D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org<br>foo</link><description>Bob's Rants</description></channel>">>,
- ?assertEqual(
- {<<"channel">>,[],
- [{<<"title">>,[],[<<"from __future__ import *">>]},
- {<<"link">>,[],[
- <<"http://bob.pythonmac.org">>,
- {<<"br">>,[],[]},
- <<"foo">>]},
- {<<"description">>,[],[<<"Bob's Rants">>]}]},
- parse(D0)),
- ok.
- parse_tokens_test() ->
- D0 = [{doctype,[<<"HTML">>,<<"PUBLIC">>,<<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
- {data,<<"\n">>,true},
- {start_tag,<<"html">>,[],false}],
- ?assertEqual(
- {<<"html">>, [], []},
- parse_tokens(D0)),
- D1 = D0 ++ [{end_tag, <<"html">>}],
- ?assertEqual(
- {<<"html">>, [], []},
- parse_tokens(D1)),
- D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
- ?assertEqual(
- {<<"html">>, [], [{<<"body">>, [], []}]},
- parse_tokens(D2)),
- D3 = D0 ++ [{start_tag, <<"head">>, [], false},
- {end_tag, <<"head">>},
- {start_tag, <<"body">>, [], false}],
- ?assertEqual(
- {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]},
- parse_tokens(D3)),
- D4 = D3 ++ [{data,<<"\n">>,true},
- {start_tag,<<"div">>,[{<<"class">>,<<"a">>}],false},
- {start_tag,<<"a">>,[{<<"name">>,<<"#anchor">>}],false},
- {end_tag,<<"a">>},
- {end_tag,<<"div">>},
- {start_tag,<<"div">>,[{<<"class">>,<<"b">>}],false},
- {start_tag,<<"div">>,[{<<"class">>,<<"c">>}],false},
- {end_tag,<<"div">>},
- {end_tag,<<"div">>}],
- ?assertEqual(
- {<<"html">>, [],
- [{<<"head">>, [], []},
- {<<"body">>, [],
- [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
- {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
- ]}]},
- parse_tokens(D4)),
- D5 = [{start_tag,<<"html">>,[],false},
- {data,<<"\n">>,true},
- {data,<<"boo">>,false},
- {data,<<"hoo">>,false},
- {data,<<"\n">>,true},
- {end_tag,<<"html">>}],
- ?assertEqual(
- {<<"html">>, [], [<<"\nboohoo\n">>]},
- parse_tokens(D5)),
- D6 = [{start_tag,<<"html">>,[],false},
- {data,<<"\n">>,true},
- {data,<<"\n">>,true},
- {end_tag,<<"html">>}],
- ?assertEqual(
- {<<"html">>, [], []},
- parse_tokens(D6)),
- D7 = [{start_tag,<<"html">>,[],false},
- {start_tag,<<"ul">>,[],false},
- {start_tag,<<"li">>,[],false},
- {data,<<"word">>,false},
- {start_tag,<<"li">>,[],false},
- {data,<<"up">>,false},
- {end_tag,<<"li">>},
- {start_tag,<<"li">>,[],false},
- {data,<<"fdsa">>,false},
- {start_tag,<<"br">>,[],true},
- {data,<<"asdf">>,false},
- {end_tag,<<"ul">>},
- {end_tag,<<"html">>}],
- ?assertEqual(
- {<<"html">>, [],
- [{<<"ul">>, [],
- [{<<"li">>, [], [<<"word">>]},
- {<<"li">>, [], [<<"up">>]},
- {<<"li">>, [], [<<"fdsa">>,{<<"br">>, [], []}, <<"asdf">>]}]}]},
- parse_tokens(D7)),
- ok.
- destack_test() ->
- {<<"a">>, [], []} =
- destack([{<<"a">>, [], []}]),
- {<<"a">>, [], [{<<"b">>, [], []}]} =
- destack([{<<"b">>, [], []}, {<<"a">>, [], []}]),
- {<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]} =
- destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
- [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}] =
- destack(<<"b">>,
- [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
- [{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}] =
- destack(<<"c">>,
- [{<<"c">>, [], []}, {<<"b">>, [], []},{<<"a">>, [], []}]),
- ok.
- doctype_test() ->
- ?assertEqual(
- {<<"html">>,[],[{<<"head">>,[],[]}]},
- mochiweb_html:parse("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
- "<html><head></head></body></html>")),
- %% http://code.google.com/p/mochiweb/issues/detail?id=52
- ?assertEqual(
- {<<"html">>,[],[{<<"head">>,[],[]}]},
- mochiweb_html:parse("<html>"
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
- "<head></head></body></html>")),
- %% http://github.com/mochi/mochiweb/pull/13
- ?assertEqual(
- {<<"html">>,[],[{<<"head">>,[],[]}]},
- mochiweb_html:parse("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\"/>"
- "<html>"
- "<head></head></body></html>")),
- ok.
- dumb_br_test() ->
- %% http://code.google.com/p/mochiweb/issues/detail?id=71
- ?assertEqual(
- {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
- mochiweb_html:parse("<div><br/><br/>z</br/></br/></div>")),
- ?assertEqual(
- {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
- mochiweb_html:parse("<div><br><br>z</br/></br/></div>")),
- ?assertEqual(
- {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>, {<<"br">>, [], []}, {<<"br">>, [], []}]},
- mochiweb_html:parse("<div><br><br>z<br/><br/></div>")),
- ?assertEqual(
- {<<"div">>,[],[{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
- mochiweb_html:parse("<div><br><br>z</br></br></div>")).
- php_test() ->
- %% http://code.google.com/p/mochiweb/issues/detail?id=71
- ?assertEqual(
- [{pi, <<"php\n">>}],
- mochiweb_html:tokens(
- "<?php\n?>")),
- ?assertEqual(
- {<<"div">>, [], [{pi, <<"php\n">>}]},
- mochiweb_html:parse(
- "<div><?php\n?></div>")),
- ok.
- parse_unquoted_attr_test() ->
- D0 = <<"<html><img src=/images/icon.png/></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon.png">> } ], [] }
- ]},
- mochiweb_html:parse(D0)),
-
- D1 = <<"<html><img src=/images/icon.png></img></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon.png">> } ], [] }
- ]},
- mochiweb_html:parse(D1)),
-
- D2 = <<"<html><img src=/images/icon>.png width=100></img></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon>.png">> }, { <<"width">>, <<"100">> } ], [] }
- ]},
- mochiweb_html:parse(D2)),
- ok.
-
- parse_quoted_attr_test() ->
- D0 = <<"<html><img src='/images/icon.png'></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon.png">> } ], [] }
- ]},
- mochiweb_html:parse(D0)),
-
- D1 = <<"<html><img src=\"/images/icon.png'></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon.png'></html>">> } ], [] }
- ]},
- mochiweb_html:parse(D1)),
- D2 = <<"<html><img src=\"/images/icon>.png\"></html>">>,
- ?assertEqual(
- {<<"html">>,[],[
- { <<"img">>, [ { <<"src">>, <<"/images/icon>.png">> } ], [] }
- ]},
- mochiweb_html:parse(D2)),
- ok.
- parse_missing_attr_name_test() ->
- D0 = <<"<html =black></html>">>,
- ?assertEqual(
- {<<"html">>, [ { <<"=">>, <<"=">> }, { <<"black">>, <<"black">> } ], [] },
- mochiweb_html:parse(D0)),
- ok.
- parse_amps_attr_test() ->
- D0 = <<"<a href=\"/hello?test=1&that=2\"></a>">>,
- ?assertEqual(
- {<<"a">>, [ { <<"href">>, <<"/hello?test=1&that=2">> }], [] },
- mochiweb_html:parse(D0)),
-
- D1 = <<"<a href=\"/hello?test=1&that=2\"></a>">>,
- ?assertEqual(
- {<<"a">>, [ { <<"href">>, <<"/hello?test=1&that=2">> }], [] },
- mochiweb_html:parse(D1)),
- D2 = <<"<a href=\"/hello?test=123&that=2&this=too\"></a>">>,
- ?assertEqual(
- {<<"a">>, [ { <<"href">>, <<"/hello?test=123&that=2&this=too">> }], [] },
- mochiweb_html:parse(D2)),
- D3 = <<"<a href=\"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H\"></a>">>,
- ?assertEqual(
- {<<"a">>, [ { <<"href">>, <<"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H">> }], [] },
- mochiweb_html:parse(D3)),
- D4 = <<"<a href=\"test?a=1&=1008\"></a>">>,
- ?assertEqual(
- {<<"a">>, [ { <<"href">>, <<"test?a=1&=1008">> }], [] },
- mochiweb_html:parse(D4)),
-
- ok.
- parse_broken_pi_test() ->
- D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>,
- ?assertEqual(
- {<<"html">>, [], [
- { pi, <<"xml:namespace">>, [ { <<"prefix">>, <<"o">> },
- { <<"ns">>, <<"urn:schemas-microsoft-com:office:office">> } ] }
- ] },
- mochiweb_html:parse(D0)),
- ok.
- parse_funny_singletons_test() ->
- D0 = <<"<html><input><input>x</input></input></html>">>,
- ?assertEqual(
- {<<"html">>, [], [
- { <<"input">>, [], [] },
- { <<"input">>, [], [ <<"x">> ] }
- ] },
- mochiweb_html:parse(D0)),
- ok.
- parse_charref_test() ->
- %% Normal charref
- D0 = <<"<div>&</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"&">>]},
- mochiweb_html:parse(D0)),
- %% Missing semicolon in the middle.
- D1 = <<"<div>& &</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"& &">>]},
- mochiweb_html:parse(D1)),
- %% Missing semicolon on the last entity
- D2 = <<"<div>&amp; &amp</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"& &">>]},
- mochiweb_html:parse(D2)),
- D3 = <<"<div>&&</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"&&">>]},
- mochiweb_html:parse(D3)),
- D4 = <<"<div>&</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"&">>]},
- mochiweb_html:parse(D4)),
- ok.
- parse_charref_garbage_in_garbage_out_test() ->
- %% faulty charref is left alone
- D1 = <<"<div>&. test</div>">>,
- ?assertEqual(
- {<<"div">>, [], [<<"&. test">>]},
- mochiweb_html:parse(D1)),
-
- ok.
-
- -endif.