PageRenderTime 151ms CodeModel.GetById 10ms app.highlight 127ms RepoModel.GetById 1ms app.codeStats 1ms

/deps/mochiweb/src/mochiweb_html.erl

https://code.google.com/p/zotonic/
Erlang | 1352 lines | 1169 code | 105 blank | 78 comment | 2 complexity | 15edba7ca92133861eba435943f0e97f MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1%% @author Bob Ippolito <bob@mochimedia.com>
   2%% @copyright 2007 Mochi Media, Inc.
   3
   4%% @doc Loosely tokenizes and generates parse trees for HTML 4.
   5-module(mochiweb_html).
   6-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
   7         escape_attr/1, to_html/1]).
   8
%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\").
-define(SQUOTE, $\').
%% Advance the decoder by N columns / bytes (no newline handling).
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder by exactly one column / byte.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Consume a newline: reset column to 1, bump line, advance offset.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Consume one character C, maintaining line/column bookkeeping.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

%% True for space, tab, CR, LF.
-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
%% True for [A-Za-z0-9] — characters safe to emit unquoted in a doctype.
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
%% Heuristic: C likely terminates a close tag (">" or whitespace).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Tokenizer state: 1-based line/column for diagnostics, 0-based byte
%% offset into the input binary.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).
  44
  45%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
  46%% @type html_attr() = {string(), string()}
  47%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
  48%% @type html_data() = {data, string(), Whitespace::boolean()}
  49%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
  50%% @type end_tag() = {end_tag, Name}
  51%% @type html_comment() = {comment, Comment}
  52%% @type html_doctype() = {doctype, [Doctype]}
  53%% @type inline_html() = {'=', iolist()}
  54
  55%% External API.
  56
  57%% @spec parse(string() | binary()) -> html_node()
  58%% @doc tokenize and then transform the token stream into a HTML tree.
  59parse(Input) ->
  60    parse_tokens(tokens(Input)).
  61
  62%% @spec parse_tokens([html_token()]) -> html_node()
  63%% @doc Transform the output of tokens(Doc) into a HTML tree.
  64parse_tokens(Tokens) when is_list(Tokens) ->
  65    %% Skip over doctype, processing instructions
  66    F = fun (X) ->
  67                case X of
  68                    {start_tag, _, _, false} ->
  69                        false;
  70                    _ ->
  71                        true
  72                end
  73        end,
  74    [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens),
  75    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
  76    Tree.
  77
  78%% @spec tokens(StringOrBinary) -> [html_token()]
  79%% @doc Transform the input UTF-8 HTML into a token stream.
  80tokens(Input) ->
  81    tokens(iolist_to_binary(Input), #decoder{}, []).
  82
  83%% @spec to_tokens(html_node()) -> [html_token()]
  84%% @doc Convert a html_node() tree to a list of tokens.
  85to_tokens({Tag0}) ->
  86    to_tokens({Tag0, [], []});
  87to_tokens(T={'=', _}) ->
  88    [T];
  89to_tokens(T={doctype, _}) ->
  90    [T];
  91to_tokens(T={comment, _}) ->
  92    [T];
  93to_tokens({Tag0, Acc}) ->
  94    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
  95    to_tokens({Tag0, [], Acc});
  96to_tokens({Tag0, Attrs, Acc}) ->
  97    Tag = to_tag(Tag0),
  98    to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]).
  99
 100%% @spec to_html([html_token()] | html_node()) -> iolist()
 101%% @doc Convert a list of html_token() to a HTML document.
 102to_html(Node) when is_tuple(Node) ->
 103    to_html(to_tokens(Node));
 104to_html(Tokens) when is_list(Tokens) ->
 105    to_html(Tokens, []).
 106
 107%% @spec escape(string() | atom() | binary()) -> binary()
 108%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
 109escape(B) when is_binary(B) ->
 110    escape(binary_to_list(B), []);
 111escape(A) when is_atom(A) ->
 112    escape(atom_to_list(A), []);
 113escape(S) when is_list(S) ->
 114    escape(S, []).
 115
 116%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
 117%% @doc Escape a string such that it's safe for HTML attrs
 118%%      (amp; lt; gt; quot;).
 119escape_attr(B) when is_binary(B) ->
 120    escape_attr(binary_to_list(B), []);
 121escape_attr(A) when is_atom(A) ->
 122    escape_attr(atom_to_list(A), []);
 123escape_attr(S) when is_list(S) ->
 124    escape_attr(S, []);
 125escape_attr(I) when is_integer(I) ->
 126    escape_attr(integer_to_list(I), []);
 127escape_attr(F) when is_float(F) ->
 128    escape_attr(mochinum:digits(F), []).
 129
 130to_html(Tree, Acc) ->
 131    to_html(Tree, Acc, true).
 132
 133to_html([], Acc, _Escape) ->
 134    lists:reverse(Acc);
 135to_html([{'=', Content} | Rest], Acc, Escape) ->
 136    to_html(Rest, [Content | Acc], Escape);
 137to_html([{pi, Bin} | Rest], Acc, Escape) ->
 138    Open = [<<"<?">>,
 139            Bin,
 140            <<"?>">>],
 141    to_html(Rest, [Open | Acc], Escape);
 142to_html([{pi, Tag, Attrs} | Rest], Acc, Escape) ->
 143    Open = [<<"<?">>,
 144            Tag,
 145            attrs_to_html(Attrs, []),
 146            <<"?>">>],
 147    to_html(Rest, [Open | Acc], Escape);
 148to_html([{comment, Comment} | Rest], Acc, Escape) ->
 149    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc], Escape);
 150to_html([{doctype, Parts} | Rest], Acc, Escape) ->
 151    Inside = doctype_to_html(Parts, Acc),
 152    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc], Escape);
 153to_html([{data, Data, _Whitespace} | Rest], Acc, true) ->
 154    to_html(Rest, [escape(Data) | Acc], true);
 155to_html([{data, Data, _Whitespace} | Rest], Acc, false) ->
 156    to_html(Rest, [Data | Acc], false);
 157to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc, _Escape) ->
 158    EscapeData = case Tag of 
 159                     <<"script">> -> false;
 160                     _ -> true
 161                 end,
 162    Open = [<<"<">>,
 163            Tag,
 164            attrs_to_html(Attrs, []),
 165            case Singleton of
 166                true -> <<" />">>;
 167                false -> <<">">>
 168            end],
 169    to_html(Rest, [Open | Acc], EscapeData);
 170to_html([{end_tag, Tag} | Rest], Acc, _Escape) ->
 171    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc], false).
 172
 173doctype_to_html([], Acc) ->
 174    lists:reverse(Acc);
 175doctype_to_html([Word | Rest], Acc) ->
 176    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
 177                   binary_to_list(iolist_to_binary(Word))) of
 178        true ->
 179            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
 180        false ->
 181            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
 182    end.
 183
 184attrs_to_html([], Acc) ->
 185    lists:reverse(Acc);
 186attrs_to_html([{K, V} | Rest], Acc) ->
 187    attrs_to_html(Rest,
 188                  [[<<" ">>, escape(K), <<"=\"">>,
 189                    escape_attr(V), <<"\"">>] | Acc]).
 190
 191escape([], Acc) ->
 192    list_to_binary(lists:reverse(Acc));
 193escape("<" ++ Rest, Acc) ->
 194    escape(Rest, lists:reverse("&lt;", Acc));
 195escape(">" ++ Rest, Acc) ->
 196    escape(Rest, lists:reverse("&gt;", Acc));
 197escape("&" ++ Rest, Acc) ->
 198    escape(Rest, lists:reverse("&amp;", Acc));
 199escape([C | Rest], Acc) ->
 200    escape(Rest, [C | Acc]).
 201
 202escape_attr([], Acc) ->
 203    list_to_binary(lists:reverse(Acc));
 204escape_attr("<" ++ Rest, Acc) ->
 205    escape_attr(Rest, lists:reverse("&lt;", Acc));
 206escape_attr(">" ++ Rest, Acc) ->
 207    escape_attr(Rest, lists:reverse("&gt;", Acc));
 208escape_attr("&" ++ Rest, Acc) ->
 209    escape_attr(Rest, lists:reverse("&amp;", Acc));
 210escape_attr([?QUOTE | Rest], Acc) ->
 211    escape_attr(Rest, lists:reverse("&quot;", Acc));
 212escape_attr([C | Rest], Acc) ->
 213    escape_attr(Rest, [C | Acc]).
 214
 215to_tag(A) when is_atom(A) ->
 216    norm(atom_to_list(A));
 217to_tag(L) ->
 218    norm(L).
 219
%% Worker for to_tokens/1.  Walks a stack of {Tag, RemainingChildren}
%% frames, normalizing the many accepted shorthand child shapes into
%% canonical {Tag, Attrs, Children} form and emitting tokens.  Clause
%% order matters: shorthand shapes must be recognized before the
%% catch-all 2- and 3-tuple clauses.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    %% Frame exhausted: emit the close tag.
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Singleton tags take no children and no end tag.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            %% Push a frame for the child tag's own children.
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
 271
%% Main tokenizer loop: emit tokens until the offset reaches the end of
%% the input binary.  After a <script> or <textarea> open tag, the
%% element body is captured raw as a single data token.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            %% Offset is exactly at the end of the binary: done.
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    %% Raw script body, then continue normally.
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    %% Raw textarea body, then continue normally.
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.
 289
 290parse_flag({start_tag, B, _, false}) ->
 291    case string:to_lower(binary_to_list(B)) of
 292        "script" ->
 293            script;
 294        "textarea" ->
 295            textarea;
 296        _ ->
 297            none
 298    end;
 299parse_flag(_) ->
 300    none.
 301
%% Dispatch on the bytes at the current offset and produce exactly one
%% token.  Returns {Token, NewDecoderState}.  More specific prefixes
%% ("<!--", "<!DOCTYPE", ...) must be tested before the generic "<".
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            %% PHP blocks are kept raw up to the closing "?>".
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            %% Other processing instructions get parsed attributes.
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
            %% This isn't really strict HTML
            %% A "<" followed by whitespace is treated as literal text.
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            %% "/>" self-closes even non-void elements.
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.
 337
 338tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
 339    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
 340tree_data(Rest, AllWhitespace, Acc) ->
 341    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.
 342
%% Fold the token stream into a tree using a stack of open elements.
%% Returns {Tree, RemainingTokens}.
tree([], Stack) ->
    %% Input exhausted: close every element still open.
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            %% The root element was closed: the tree is complete.
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    %% Singleton tags become childless nodes of the current element.
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    %% Merge consecutive data tokens; runs that are entirely whitespace
    %% are dropped from the tree.
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    %% Doctype tokens inside the body are ignored.
    tree(Rest, Stack).
 371
 372norm({Tag, Attrs}) ->
 373    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
 374norm(Tag) when is_binary(Tag) ->
 375    Tag;
 376norm(Tag) ->
 377    list_to_binary(string:to_lower(Tag)).
 378
%% Push a newly-opened element onto the tree stack.  An <li> or <option>
%% implicitly closes an open sibling of the same name; a <dd> or <dt>
%% implicitly closes an open <dd>/<dt> (HTML optional-end-tag rules).
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].
 388
 389append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
 390    [{Name, Attrs, [StartTag | Acc]} | Stack].
 391
%% Close elements on the stack up to (and including) the one named
%% TagName.  Returns a new stack, or the completed tree tuple when the
%% root element itself is closed.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% Predicate: true while the element is NOT the one being closed.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent everything accumulated after the
                            %% supposed singleton into it as children.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.
 431
 432destack([{Tag, Attrs, Acc}]) ->
 433    {Tag, Attrs, lists:reverse(Acc)};
 434destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
 435    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
 436
 437is_singleton(<<"br">>) -> true;
 438is_singleton(<<"hr">>) -> true;
 439is_singleton(<<"img">>) -> true;
 440is_singleton(<<"input">>) -> true;
 441is_singleton(<<"base">>) -> true;
 442is_singleton(<<"meta">>) -> true;
 443is_singleton(<<"link">>) -> true;
 444is_singleton(<<"area">>) -> true;
 445is_singleton(<<"param">>) -> true;
 446is_singleton(<<"col">>) -> true;
 447is_singleton(_) -> false.
 448
%% Consume a run of character data up to the next "<" or "&" (or end of
%% input).
tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

%% Start is the byte offset where the run began; Whitespace tracks
%% whether every character seen so far is whitespace.
tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            %% Hit "<", "&" or end of input: slice the run out of B.
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.
 462
%% Tokenize a tag's attribute list, stopping at ">", "/", "?>" or end of
%% input.  Returns {[{Name, Value}], NewState}.
tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            %% End of input.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            %% Tag terminator (left for find_gt to consume).
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            %% Processing-instruction terminator.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.
 481
%% Tokenize the value following an attribute name.  A bare attribute
%% without "=" (e.g. <input disabled>) gets its own name as the value.
tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            %% No "=": value-less attribute.
            {Attr, S1}
    end.
 492    
%% Dispatch on the first character of an attribute value: a single or
%% double quote starts a quoted value, anything else an unquoted one.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            %% End of input: empty value.
            { [], S };
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.
 503    
%% Accumulate a quoted attribute value up to the matching quote Q,
%% expanding "&...;" character references.  A newline also terminates
%% the (malformed) value.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            %% Unterminated value at end of input.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data|Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            %% Matching close quote.
            { iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S) };
        <<_:O/binary, $\n, _/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), ?INC_LINE(S) };
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C|Acc], Q)
    end.
 518    
%% Accumulate an unquoted attribute value, stopping before "/>",
%% whitespace or ">".  Character references are expanded.
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data|Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            %% Self-close marker terminates the value.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C|Acc])
    end.
 533
 534skip_whitespace(B, S=#decoder{offset=O}) ->
 535    case B of
 536        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
 537            skip_whitespace(B, ?INC_CHAR(S, C));
 538        _ ->
 539            S
 540    end.
 541
%% Tokenize a tag or attribute name.  If the very first character is a
%% terminator (">", "/", "="), consume just that character so the caller
%% cannot loop forever on a zero-width literal.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                    orelse C =:= $/
                                    orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.

%% Accumulate literal characters until whitespace or a terminator;
%% "&...;" references are expanded and the result is lowercased.
tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.
 567
%% Capture raw content up to (but not including) the next "?>".  Used
%% for "<?php" blocks whose bodies must not be tokenized.
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            %% Unterminated: everything from Start to the end is raw.
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.
 583
 584find_qgt(Bin, S=#decoder{offset=O}) ->
 585    case Bin of
 586        <<_:O/binary, "?>", _/binary>> ->
 587            ?ADV_COL(S, 2);
 588        <<_:O/binary, ">", _/binary>> ->
 589			?ADV_COL(S, 1);
 590        <<_:O/binary, "/>", _/binary>> ->
 591			?ADV_COL(S, 2);
 592        %% tokenize_attributes takes care of this state:
 593        %% <<_:O/binary, C, _/binary>> ->
 594        %%     find_qgt(Bin, ?INC_CHAR(S, C));
 595        <<_:O/binary>> ->
 596            S
 597    end.
 598
%% Scan forward to just past the next ">".  Returns {State, HasSlash}
%% where HasSlash is true if any "/" was seen on the way (used to detect
%% self-closing "<tag ... />" syntax).
find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            %% End of input without ">".
            {S, HasSlash}
    end.
 613
%% Tokenize a character/entity reference; the leading "&" has already
%% been consumed and Start points just past it.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    tokenize_charref(Bin, S, O).

tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            %% End of input: emit what's left as literal data.
            <<_:Start/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $<
                                         orelse C =:= $>
                                         orelse C =:= $& ->
            %% Reference ended without ";".  Try to resolve anyway; if
            %% unknown, re-emit the raw text including the leading "&"
            %% (hence the Start - 1 / Len + 1 slice).
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           Start1 = Start - 1,
                           Len1 = Len + 1,
                           <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                           R;
                       Unichar ->
                           mochiutf8:codepoint_to_bytes(Unichar)
                   end,
            {{data, Data, false}, S};
        <<_:O/binary, $;, _/binary>> ->
            %% Properly ";"-terminated reference.
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           %% Unknown entity: keep "&...;" verbatim
                           %% (Len + 2 covers both "&" and ";").
                           Start1 = Start - 1,
                           Len1 = Len + 2,
                           <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                           R;
                       Unichar ->
                           mochiutf8:codepoint_to_bytes(Unichar)
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.
 657
%% Tokenize a doctype declaration into its whitespace-separated,
%% possibly quoted, parts.  Offset starts just past "<!DOCTYPE".
tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            %% Unterminated doctype at end of input.
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.
 673
%% Tokenize either a quoted word or a bare literal.  The caller ensures
%% the current character is not whitespace; there is deliberately no
%% clause for it, so whitespace here is a crash (function_clause).
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.
 682
%% Accumulate a quoted word up to the matching Quote character,
%% expanding "&...;" character references along the way.
tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            %% Unterminated word at end of input.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            %% Matching close quote.
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.
 698
 699tokenize_cdata(Bin, S=#decoder{offset=O}) ->
 700    tokenize_cdata(Bin, S, O).
 701
 702tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
 703    case Bin of
 704        <<_:O/binary, "]]>", _/binary>> ->
 705            Len = O - Start,
 706            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
 707            {{data, Raw, false}, ?ADV_COL(S, 3)};
 708        <<_:O/binary, C, _/binary>> ->
 709            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
 710        _ ->
 711            <<_:O/binary, Raw/binary>> = Bin,
 712            {{data, Raw, false}, S}
 713    end.
 714
%% Capture comment text up to "-->"; the offset starts just past "<!--".
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Unterminated comment: the rest of the input is the body.
            {{comment, Raw}, S}
    end.
 729
%% Capture the raw body of a <script> element as one data token,
%% stopping just before a case-insensitive "</script" close tag.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
        when (SS =:= $s orelse SS =:= $S) andalso
             (CC =:= $c orelse CC =:= $C) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (II =:= $i orelse II =:= $I) andalso
             (PP =:= $p orelse PP =:= $P) andalso
             (TT=:= $t orelse TT =:= $T) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% No close tag: the rest of the input is the script body.
            {{data, Raw, false}, S}
    end.
 752
%% Capture the raw body of a <textarea> element as one data token,
%% stopping just before a case-insensitive "</textarea" close tag.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
        when (TT =:= $t orelse TT =:= $T) andalso
             (EE =:= $e orelse EE =:= $E) andalso
             (XX =:= $x orelse XX =:= $X) andalso
             (TT2 =:= $t orelse TT2 =:= $T) andalso
             (AA =:= $a orelse AA =:= $A) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (EE2 =:= $e orelse EE2 =:= $E) andalso
             (AA2 =:= $a orelse AA2 =:= $A) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% No close tag: the rest of the input is the textarea body.
            {{data, Raw, false}, S}
    end.
 777
 778
 779%%
 780%% Tests
 781%%
 782-ifdef(TEST).
 783-include_lib("eunit/include/eunit.hrl").
 784
%% @doc to_html/1 serializes a tuple tree back to an HTML iolist.  Covers
%% atom and binary tag names, attribute rendering, the singleton <br />,
%% raw ('=') passthrough, comment nodes, doctype declarations, and
%% processing instructions.
to_html_test() ->
    ?assertEqual(
       <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div>RAW!<!-- comment! --></body></html>">>,
       iolist_to_binary(
         to_html({html, [],
                  [{<<"head">>, [],
                    [{title, <<"hey!">>}]},
                   {body, [],
                    [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
                     {'div', <<"sucka">>},
                     {'=', <<"RAW!">>},
                     {comment, <<" comment! ">>}]}]}))),
    %% Doctype parts are emitted space-separated, quoting as needed.
    ?assertEqual(
       <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
       iolist_to_binary(
         to_html({doctype,
                  [<<"html">>, <<"PUBLIC">>,
                   <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
                   <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))),
    %% Processing instructions render as <?name k="v" ...?>.
    ?assertEqual(
       <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>,
       iolist_to_binary(
         to_html({<<"html">>,[],
                  [{pi, <<"xml:namespace">>,
                    [{<<"prefix">>,<<"o">>},
                     {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))),
    ok.
 812
 813escape_test() ->
 814    ?assertEqual(
 815       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
 816       escape(<<"&quot;\"word ><<up!&quot;">>)),
 817    ?assertEqual(
 818       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
 819       escape("&quot;\"word ><<up!&quot;")),
 820    ?assertEqual(
 821       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
 822       escape('&quot;\"word ><<up!&quot;')),
 823    ok.
 824
 825escape_attr_test() ->
 826    ?assertEqual(
 827       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
 828       escape_attr(<<"&quot;\"word ><<up!&quot;">>)),
 829    ?assertEqual(
 830       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
 831       escape_attr("&quot;\"word ><<up!&quot;")),
 832    ?assertEqual(
 833       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
 834       escape_attr('&quot;\"word ><<up!&quot;')),
 835    ?assertEqual(
 836       <<"12345">>,
 837       escape_attr(12345)),
 838    ?assertEqual(
 839       <<"1.5">>,
 840       escape_attr(1.5)),
 841    ok.
 842
%% @doc tokens/1 lexer edge cases: attribute quoting styles, IE
%% conditional comments, raw <script>/<textarea> content, processing
%% instructions, charrefs, and a bare "<" that is not markup.
tokens_test() ->
    %% Bare, single- and double-quoted attribute values tokenize to the
    %% same pairs; a valueless attribute gets its own name as value, and
    %% the trailing "/" sets the self-closing flag (true).
    ?assertEqual(
       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                                {<<"wibble">>, <<"wibble">>},
                                {<<"alice">>, <<"bob">>}], true}],
       tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>)),
    ?assertEqual(
       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                                {<<"wibble">>, <<"wibble">>},
                                {<<"alice">>, <<"bob">>}], true}],
       tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>)),
    %% An IE conditional comment survives verbatim as one comment token.
    ?assertEqual(
       [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}],
       tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>)),
    %% <script> content is raw data: "<=" inside is not markup.  The next
    %% three variants differ only in whitespace around "type=".
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>)),
    ?assertEqual(
       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
        {data, <<" A= B <= C ">>, false},
        {end_tag, <<"script">>}],
       tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>)),
    %% <textarea> content is raw too: tags inside come back as data.
    ?assertEqual(
       [{start_tag, <<"textarea">>, [], false},
        {data, <<"<html></body>">>, false},
        {end_tag, <<"textarea">>}],
       tokens(<<"<textarea><html></body></textarea>">>)),
    %% "</textareaz>" is not a real close tag, so data runs to the end.
    ?assertEqual(
       [{start_tag, <<"textarea">>, [], false},
        {data, <<"<html></body></textareaz>">>, false}],
       tokens(<<"<textarea ><html></body></textareaz>">>)),
    %% Processing instructions: quoted values, unquoted values with a
    %% trailing newline, and an unterminated PI at end of input.
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?>">>)),
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office \n?>">>)),
    ?assertEqual(
       [{pi, <<"xml:namespace">>,
         [{<<"prefix">>,<<"o">>},
          {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
       tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office">>)),
    %% Charrefs decode in text data.
    ?assertEqual(
       [{data, <<"<">>, false}],
       tokens(<<"&lt;">>)),
    %% A "<" that never becomes a tag is emitted as (split) data tokens.
    ?assertEqual(
       [{data, <<"not html ">>, false},
        {data, <<"< at all">>, false}],
       tokens(<<"not html < at all">>)),
    ok.
 909
%% @doc End-to-end parse/1 of a realistic document plus smaller cases:
%% the doctype and inter-tag whitespace are dropped, singletons
%% (<meta>, <link>, <br>) get empty child lists, the conditional comment
%% is preserved, the CDATA section is unwrapped to text, and tag name
%% matching is case-insensitive.
parse_test() ->
    D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
<html>
 <head>
   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
   <title>Foo</title>
   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
   <!--[if lt IE 7]>
   <style type=\"text/css\">
     .no_ie { display: none; }
   </style>
   <![endif]-->
   <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
   <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
 </head>
 <body id=\"home\" class=\"tundra\"><![CDATA[&lt;<this<!-- is -->CDATA>&gt;]]></body>
</html>">>,
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [],
          [{<<"meta">>,
            [{<<"http-equiv">>,<<"Content-Type">>},
             {<<"content">>,<<"text/html; charset=UTF-8">>}],
            []},
           {<<"title">>,[],[<<"Foo">>]},
           {<<"link">>,
            [{<<"rel">>,<<"stylesheet">>},
             {<<"type">>,<<"text/css">>},
             {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
             {<<"media">>,<<"screen">>}],
            []},
           {<<"link">>,
            [{<<"rel">>,<<"stylesheet">>},
             {<<"type">>,<<"text/css">>},
             {<<"href">>,<<"/static/foo.css">>},
             {<<"media">>,<<"screen">>}],
            []},
           {comment,<<"[if lt IE 7]>\n   <style type=\"text/css\">\n     .no_ie { display: none; }\n   </style>\n   <![endif]">>},
           {<<"link">>,
            [{<<"rel">>,<<"icon">>},
             {<<"href">>,<<"/static/images/favicon.ico">>},
             {<<"type">>,<<"image/x-icon">>}],
            []},
           {<<"link">>,
            [{<<"rel">>,<<"shortcut icon">>},
             {<<"href">>,<<"/static/images/favicon.ico">>},
             {<<"type">>,<<"image/x-icon">>}],
            []}]},
         {<<"body">>,
          [{<<"id">>,<<"home">>},
           {<<"class">>,<<"tundra">>}],
          [<<"&lt;<this<!-- is -->CDATA>&gt;">>]}]},
       parse(D0)),
    %% A processing instruction becomes a {pi, Name, Attrs} child node.
    ?assertEqual(
       {<<"html">>,[],
        [{pi, <<"xml:namespace">>,
          [{<<"prefix">>,<<"o">>},
           {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]},
       parse(
         <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>)),
    %% Unclosed <dd>/<dt> end up as siblings, closed implicitly.
    ?assertEqual(
       {<<"html">>, [],
        [{<<"dd">>, [], [<<"foo">>]},
         {<<"dt">>, [], [<<"bar">>]}]},
       parse(<<"<html><dd>foo<dt>bar</html>">>)),
    %% Singleton sadness
    ?assertEqual(
       {<<"html">>, [],
        [{<<"link">>, [], []},
         <<"foo">>,
         {<<"br">>, [], []},
         <<"bar">>]},
       parse(<<"<html><link>foo<br>bar</html>">>)),
    %% ...but with an explicit </link>, the content nests inside it.
    ?assertEqual(
       {<<"html">>, [],
        [{<<"link">>, [], [<<"foo">>,
                           {<<"br">>, [], []},
                           <<"bar">>]}]},
       parse(<<"<html><link>foo<br>bar</link></html>">>)),
    %% Case insensitive tags
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [], [<<"foo">>,
                           {<<"br">>, [], []},
                           <<"BAR">>]},
         {<<"body">>, [{<<"class">>, <<"">>}, {<<"bgcolor">>, <<"#Aa01fF">>}], []}
        ]},
       parse(<<"<html><Head>foo<bR>BAR</head><body Class=\"\" bgcolor=\"#Aa01fF\"></BODY></html>">>)),
    ok.
1000
%% @doc Drive every clause of is_singleton/1 via the cover-generated
%% clause lookup table, checking each tag maps to its expected boolean.
exhaustive_is_singleton_test() ->
    Table = mochiweb_cover:clause_lookup_table(?MODULE, is_singleton),
    lists:foreach(
      fun({Tag, Expected}) ->
              ?assertEqual(Expected, is_singleton(Tag))
      end, Table).
1004
%% @doc Attribute parsing edge cases: an entity (&quot;) inside a quoted
%% value, valueless attributes (wibble, quux) defaulting to their own
%% name, and a charref inside an attribute *name* (&copy; decodes to the
%% UTF-8 bytes C2 A9 within the name).
tokenize_attributes_test() ->
    ?assertEqual(
       {<<"foo">>,
        [{<<"bar">>, <<"b\"az">>},
         {<<"wibble">>, <<"wibble">>},
         {<<"taco", 16#c2, 16#a9>>, <<"bell">>},
         {<<"quux">>, <<"quux">>}],
        []},
       parse(<<"<foo bar=\"b&quot;az\" wibble taco&copy;=bell quux">>)),
    ok.
1015
%% @doc Tokenizing an RSS-style document.  Note <link> (an HTML
%% singleton) gets a 'true' self-closing flag on its start_tag token,
%% while the following data and explicit </link> still tokenize normally.
tokens2_test() ->
    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org</link><description>Bob's Rants</description></channel>">>,
    ?assertEqual(
       [{start_tag,<<"channel">>,[],false},
        {start_tag,<<"title">>,[],false},
        {data,<<"from __future__ import *">>,false},
        {end_tag,<<"title">>},
        {start_tag,<<"link">>,[],true},
        {data,<<"http://bob.pythonmac.org">>,false},
        {end_tag,<<"link">>},
        {start_tag,<<"description">>,[],false},
        {data,<<"Bob's Rants">>,false},
        {end_tag,<<"description">>},
        {end_tag,<<"channel">>}],
       tokens(D0)),
    ok.
1032
%% @doc to_tokens/1 flattens the tuple-tree representation into a token
%% list.  Attribute values may be arbitrary terms here (e.g. integer 1).
to_tokens_test() ->
    ?assertEqual(
       [{start_tag, <<"p">>, [{class, 1}], false},
        {end_tag, <<"p">>}],
       to_tokens({p, [{class, 1}], []})),
    %% A 1-tuple is shorthand for an element with no attrs and no children.
    ?assertEqual(
       [{start_tag, <<"p">>, [], false},
        {end_tag, <<"p">>}],
       to_tokens({p})),
    %% Raw ('=') and comment nodes pass through as single tokens.
    ?assertEqual(
       [{'=', <<"data">>}],
       to_tokens({'=', <<"data">>})),
    ?assertEqual(
       [{comment, <<"comment">>}],
       to_tokens({comment, <<"comment">>})),
    %% This is only allowed in sub-tags:
    %% {p, [{"class", "foo"}]} as {p, [{"class", "foo"}], []}
    %% On the outside it's always treated as follows:
    %% {p, [], [{"class", "foo"}]} as {p, [], [{"class", "foo"}]}
    ?assertEqual(
       [{start_tag, <<"html">>, [], false},
        {start_tag, <<"p">>, [{class, 1}], false},
        {end_tag, <<"p">>},
        {end_tag, <<"html">>}],
       to_tokens({html, [{p, [{class, 1}]}]})),
    ok.
1059
%% @doc parse/1 keeps text on both sides of a singleton <br> inside an
%% explicitly closed <link> as siblings in <link>'s child list.
parse2_test() ->
    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org<br>foo</link><description>Bob's Rants</description></channel>">>,
    ?assertEqual(
       {<<"channel">>,[],
        [{<<"title">>,[],[<<"from __future__ import *">>]},
         {<<"link">>,[],[
                         <<"http://bob.pythonmac.org">>,
                         {<<"br">>,[],[]},
                         <<"foo">>]},
         {<<"description">>,[],[<<"Bob's Rants">>]}]},
       parse(D0)),
    ok.
1072
%% @doc parse_tokens/1 builds a tree from a pre-lexed token stream,
%% tolerating missing end tags and dropping whitespace-only data between
%% top-level structure.
parse_tokens_test() ->
    %% Doctype and leading whitespace data are dropped; an unclosed
    %% <html> still yields a complete root.
    D0 = [{doctype,[<<"HTML">>,<<"PUBLIC">>,<<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
          {data,<<"\n">>,true},
          {start_tag,<<"html">>,[],false}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D0)),
    %% An explicit </html> gives the same result.
    D1 = D0 ++ [{end_tag, <<"html">>}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D1)),
    %% An unclosed <body> becomes a child of <html>.
    D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
    ?assertEqual(
       {<<"html">>, [], [{<<"body">>, [], []}]},
       parse_tokens(D2)),
    %% <head>...</head> followed by <body> yields two siblings.
    D3 = D0 ++ [{start_tag, <<"head">>, [], false},
                {end_tag, <<"head">>},
                {start_tag, <<"body">>, [], false}],
    ?assertEqual(
       {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]},
       parse_tokens(D3)),
    %% Nested, properly closed divs/anchors nest as expected.
    D4 = D3 ++ [{data,<<"\n">>,true},
                {start_tag,<<"div">>,[{<<"class">>,<<"a">>}],false},
                {start_tag,<<"a">>,[{<<"name">>,<<"#anchor">>}],false},
                {end_tag,<<"a">>},
                {end_tag,<<"div">>},
                {start_tag,<<"div">>,[{<<"class">>,<<"b">>}],false},
                {start_tag,<<"div">>,[{<<"class">>,<<"c">>}],false},
                {end_tag,<<"div">>},
                {end_tag,<<"div">>}],
    ?assertEqual(
       {<<"html">>, [],
        [{<<"head">>, [], []},
         {<<"body">>, [],
          [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
           {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
          ]}]},
       parse_tokens(D4)),
    %% Adjacent data tokens are merged into one text node.
    D5 = [{start_tag,<<"html">>,[],false},
          {data,<<"\n">>,true},
          {data,<<"boo">>,false},
          {data,<<"hoo">>,false},
          {data,<<"\n">>,true},
          {end_tag,<<"html">>}],
    ?assertEqual(
       {<<"html">>, [], [<<"\nboohoo\n">>]},
       parse_tokens(D5)),
    %% Whitespace-only data with no real text is dropped entirely.
    D6 = [{start_tag,<<"html">>,[],false},
          {data,<<"\n">>,true},
          {data,<<"\n">>,true},
          {end_tag,<<"html">>}],
    ?assertEqual(
       {<<"html">>, [], []},
       parse_tokens(D6)),
    %% <li> items without close tags become siblings, not nested.
    D7 = [{start_tag,<<"html">>,[],false},
          {start_tag,<<"ul">>,[],false},
          {start_tag,<<"li">>,[],false},
          {data,<<"word">>,false},
          {start_tag,<<"li">>,[],false},
          {data,<<"up">>,false},
          {end_tag,<<"li">>},
          {start_tag,<<"li">>,[],false},
          {data,<<"fdsa">>,false},
          {start_tag,<<"br">>,[],true},
          {data,<<"asdf">>,false},
          {end_tag,<<"ul">>},
          {end_tag,<<"html">>}],
    ?assertEqual(
       {<<"html">>, [],
        [{<<"ul">>, [],
          [{<<"li">>, [], [<<"word">>]},
           {<<"li">>, [], [<<"up">>]},
           {<<"li">>, [], [<<"fdsa">>,{<<"br">>, [], []}, <<"asdf">>]}]}]},
       parse_tokens(D7)),
    ok.
1148
%% @doc destack/1 folds the open-element stack (innermost first) into a
%% single tree; destack/2 collapses only up to the named open tag and
%% returns the remaining stack.
destack_test() ->
    ?assertEqual({<<"a">>, [], []},
                 destack([{<<"a">>, [], []}])),
    ?assertEqual({<<"a">>, [], [{<<"b">>, [], []}]},
                 destack([{<<"b">>, [], []}, {<<"a">>, [], []}])),
    ?assertEqual({<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]},
                 destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}])),
    %% Closing <b> folds <c> into <b> and <b> into <a>.
    ?assertEqual([{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}],
                 destack(<<"b">>,
                         [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}])),
    %% Closing <c> only folds <c> into <b>; <a> stays open.
    ?assertEqual([{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}],
                 destack(<<"c">>,
                         [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}])),
    ok.
1163
%% @doc Doctype declarations are tolerated in several positions and
%% forms: before <html>, inside <html>, and written self-closed ("/>").
%% The stray </body> in the inputs is also ignored.
doctype_test() ->
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
                           "<html><head></head></body></html>")),
    %% http://code.google.com/p/mochiweb/issues/detail?id=52
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<html>"
                           "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
                           "<head></head></body></html>")),
    %% http://github.com/mochi/mochiweb/pull/13
    ?assertEqual(
       {<<"html">>,[],[{<<"head">>,[],[]}]},
       mochiweb_html:parse("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\"/>"
                           "<html>"
                           "<head></head></body></html>")),
    ok.
1182
%% @doc Stray </br> and </br/> close tags are ignored; <br> in any
%% spelling always parses as an empty singleton element.
dumb_br_test() ->
    %% http://code.google.com/p/mochiweb/issues/detail?id=71
    TwoBrsThenZ = {<<"div">>, [],
                   [{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>]},
    ?assertEqual(TwoBrsThenZ,
                 mochiweb_html:parse("<div><br/><br/>z</br/></br/></div>")),
    ?assertEqual(TwoBrsThenZ,
                 mochiweb_html:parse("<div><br><br>z</br/></br/></div>")),
    %% Real trailing <br/> elements are kept, of course.
    ?assertEqual({<<"div">>, [],
                  [{<<"br">>, [], []}, {<<"br">>, [], []}, <<"z">>,
                   {<<"br">>, [], []}, {<<"br">>, [], []}]},
                 mochiweb_html:parse("<div><br><br>z<br/><br/></div>")),
    ?assertEqual(TwoBrsThenZ,
                 mochiweb_html:parse("<div><br><br>z</br></br></div>")).
1197
1198
%% @doc "<?php ... ?>" blocks tokenize and parse as {pi, Content} nodes.
php_test() ->
    %% http://code.google.com/p/mochiweb/issues/detail?id=71
    PhpPi = {pi, <<"php\n">>},
    ?assertEqual([PhpPi],
                 mochiweb_html:tokens("<?php\n?>")),
    ?assertEqual({<<"div">>, [], [PhpPi]},
                 mochiweb_html:parse("<div><?php\n?></div>")),
    ok.
1210
%% @doc Unquoted attribute values: terminated by "/>", by ">", and with
%% charrefs decoded inside the value.
parse_unquoted_attr_test() ->
    Img = fun(Attrs) -> {<<"html">>, [], [{<<"img">>, Attrs, []}]} end,
    %% Value terminated by the self-closing "/>".
    ?assertEqual(Img([{<<"src">>, <<"/images/icon.png">>}]),
                 mochiweb_html:parse(<<"<html><img src=/images/icon.png/></html>">>)),
    %% Value terminated by a plain ">".
    ?assertEqual(Img([{<<"src">>, <<"/images/icon.png">>}]),
                 mochiweb_html:parse(<<"<html><img src=/images/icon.png></img></html>">>)),
    %% Charrefs inside an unquoted value are decoded (&gt; -> >).
    ?assertEqual(Img([{<<"src">>, <<"/images/icon>.png">>},
                      {<<"width">>, <<"100">>}]),
                 mochiweb_html:parse(<<"<html><img src=/images/icon&gt;.png width=100></img></html>">>)),
    ok.
1233    
%% @doc Quoted attribute values: single quotes, an unterminated double
%% quote (which swallows the rest of the input), and charref decoding.
parse_quoted_attr_test() ->
    Img = fun(Src) ->
                  {<<"html">>, [], [{<<"img">>, [{<<"src">>, Src}], []}]}
          end,
    %% Single-quoted value.
    ?assertEqual(Img(<<"/images/icon.png">>),
                 mochiweb_html:parse(<<"<html><img src='/images/icon.png'></html>">>)),
    %% Unterminated double quote: the value runs to the end of input.
    ?assertEqual(Img(<<"/images/icon.png'></html>">>),
                 mochiweb_html:parse(<<"<html><img src=\"/images/icon.png'></html>">>)),
    %% Charrefs inside a quoted value are decoded (&gt; -> >).
    ?assertEqual(Img(<<"/images/icon>.png">>),
                 mochiweb_html:parse(<<"<html><img src=\"/images/icon&gt;.png\"></html>">>)),
    ok.
1256
%% @doc An "=" with no attribute name before it: both the "=" and the
%% bare word become self-valued attributes rather than crashing.
parse_missing_attr_name_test() ->
    Doc = <<"<html =black></html>">>,
    ?assertEqual({<<"html">>,
                  [{<<"=">>, <<"=">>}, {<<"black">>, <<"black">>}],
                  []},
                 mochiweb_html:parse(Doc)),
    ok.
1263
%% @doc Ampersands in attribute values: "&amp;" decodes to "&", a bare
%% "&" is left alone, and non-entity sequences like "&amp=" or "&id="
%% are preserved literally.
parse_amps_attr_test() ->
    Cases =
        [{<<"<a href=\"/hello?test=1&amp;that=2\"></a>">>,
          <<"/hello?test=1&that=2">>},
         {<<"<a href=\"/hello?test=1&that=2\"></a>">>,
          <<"/hello?test=1&that=2">>},
         {<<"<a href=\"/hello?test=123&that=2&amp;this=too\"></a>">>,
          <<"/hello?test=123&that=2&this=too">>},
         {<<"<a href=\"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H\"></a>">>,
          <<"/product/54?c=hk-machine&id=1008&shop=auto-oko-74-H">>},
         {<<"<a href=\"test?a=1&amp=1008\"></a>">>,
          <<"test?a=1&amp=1008">>}],
    lists:foreach(
      fun({Html, Href}) ->
              ?assertEqual({<<"a">>, [{<<"href">>, Href}], []},
                           mochiweb_html:parse(Html))
      end, Cases),
    ok.
1291
%% @doc A sloppy processing instruction — spaces around "=" and a
%% trailing "/" — still parses into name/value attribute pairs.
parse_broken_pi_test() ->
    Doc = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>,
    ?assertEqual({<<"html">>, [],
                  [{pi, <<"xml:namespace">>,
                    [{<<"prefix">>, <<"o">>},
                     {<<"ns">>, <<"urn:schemas-microsoft-com:office:office">>}]}]},
                 mochiweb_html:parse(Doc)),
    ok.
1301
%% @doc Nested <input> elements: per the expected tree, the first
%% <input> stays an empty singleton while the second keeps its text
%% child; the extra </input> close tags do not break the parse.
parse_funny_singletons_test() ->
    Doc = <<"<html><input><input>x</input></input></html>">>,
    ?assertEqual({<<"html">>, [],
                  [{<<"input">>, [], []},
                   {<<"input">>, [], [<<"x">>]}]},
                 mochiweb_html:parse(Doc)),
    ok.
1311
%% @doc Charref decoding in text, with and without the terminating
%% semicolon, in every position of the text node.
parse_charref_test() ->
    Cases =
        [%% Normal charref
         {<<"<div>&amp;</div>">>, <<"&">>},
         %% Missing semicolon in the middle
         {<<"<div>&amp &amp;</div>">>, <<"& &">>},
         %% Missing semicolon on the last entity
         {<<"<div>&amp &amp</div>">>, <<"& &">>},
         %% Two unterminated charrefs back to back
         {<<"<div>&amp&amp</div>">>, <<"&&">>},
         %% Unterminated charref right before the close tag
         {<<"<div>&amp</div>">>, <<"&">>}],
    lists:foreach(
      fun({Html, Text}) ->
              ?assertEqual({<<"div">>, [], [Text]},
                           mochiweb_html:parse(Html))
      end, Cases),
    ok.
1342
1343parse_charref_garbage_in_garbage_out_test() ->
1344    %% faulty charref is left alone
1345    D1 = <<"<div>&amp. test</div>">>,
1346    ?assert

Large files are truncated, but you can click here to view the full file