%% PageRenderTime 93ms CodeModel.GetById 10ms app.highlight 75ms RepoModel.GetById 1ms app.codeStats 1ms
%%
%% /src/mochiweb_html.erl
%%
%% http://github.com/basho/mochiweb
%% Erlang | 792 lines | 649 code | 64 blank | 79 comment | 1 complexity | b945da9d1ef5644e84e7bdaf30a7033a MD5 | raw file
%% @author Bob Ippolito <bob@mochimedia.com>
%% @copyright 2007 Mochi Media, Inc.
%%
%% Permission is hereby granted, free of charge, to any person obtaining a
%% copy of this software and associated documentation files (the "Software"),
%% to deal in the Software without restriction, including without limitation
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
%% and/or sell copies of the Software, and to permit persons to whom the
%% Software is furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be included in
%% all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
%% DEALINGS IN THE SOFTWARE.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).
%% NOTE(review): export_all defeats unused-function warnings and exposes
%% internals; kept as-is here, but it should eventually be replaced by
%% the explicit export lists.
-compile([export_all]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'
%% Advance the decoder by N columns/bytes (assumes no newlines among them).
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder by exactly one column/byte.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Consume a newline: reset column, bump line, advance offset.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Consume one character C, updating line/column correctly when C is $\n.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
%% ASCII alphanumeric: safe to appear unquoted in tags/doctype words.
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
%% A character that very likely terminates an unquoted value/tag body.
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Tokenizer position: 1-based line/column plus a 0-based byte offset
%% into the input binary.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}

%% External API.

 79%% @spec parse(string() | binary()) -> html_node()
 80%% @doc tokenize and then transform the token stream into a HTML tree.
 81parse(Input) ->
 82    parse_tokens(tokens(Input)).
 83
 84%% @spec parse_tokens([html_token()]) -> html_node()
 85%% @doc Transform the output of tokens(Doc) into a HTML tree.
 86parse_tokens(Tokens) when is_list(Tokens) ->
 87    %% Skip over doctype, processing instructions
 88    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
 89    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
 90    Tree.
 91
%% Scan forward to the first start tag, tracking whether an HTML5
%% doctype ([<<"html">>]) was seen so that a missing <html> root can be
%% synthesized.  Returns the remaining token list, or [] when no start
%% tag exists (parse_tokens/1 will then fail its match, intentionally).
find_document(Tokens=[{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].

%% In html5 mode, a document whose first tag is not <html> gets a
%% synthetic attribute-free <html> root prepended; any other input is
%% returned untouched.
maybe_add_html_tag(Tokens, html5) ->
    case Tokens of
        [{start_tag, Tag, _Attrs, false} | _] when Tag =/= <<"html">> ->
            [{start_tag, <<"html">>, [], false} | Tokens];
        _ ->
            Tokens
    end;
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.

%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    %% Normalize any iodata to a binary and start the tokenizer from a
    %% fresh decoder state (line 1, column 1, offset 0).
    Bin = iolist_to_binary(Input),
    tokens(Bin, #decoder{}, []).

%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    %% {br} shorthand: no attributes, no children.
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    %% Raw inline HTML passes through as a single token.
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            %% Void element: emit the start tag only; children ignored.
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            %% Queue {Tag, Acc} so to_tokens/2 emits the matching
            %% end_tag once the children are exhausted.
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.

%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    %% A tuple is a single html_node(): expand it to tokens first.
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).

%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
%% All three input shapes are normalized to a character list before the
%% worker clause does the actual escaping.
escape(S) when is_list(S) ->
    escape(S, []);
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []).

%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
%% Every accepted type is normalized to a character list first; floats
%% are formatted via mochinum:digits/1.
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    escape_attr(mochinum:digits(F), []).

%% Render a token stream into an iolist.  Tokens are accumulated in
%% reverse order in Acc and reversed once at the end.
to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    %% Raw inline HTML is emitted verbatim, unescaped.
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    %% BUG FIX: the doctype word accumulator must start empty.  The old
    %% code passed Acc here, so doctype_to_html/2 reversed everything
    %% already rendered into the DOCTYPE whenever the doctype token was
    %% not the first token in the stream.
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    %% Character data is always HTML-escaped on output.
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).

%% Render doctype words: bare when every character is alphanumeric,
%% otherwise double-quoted with attribute escaping.  Callers are
%% expected to pass [] as the initial Acc.
doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            %% ?QUOTE is the bare $" character; a byte is valid iolist data.
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.

%% Render an attribute list as iolist fragments of the form
%% ` name="escaped value"', name escaped for HTML and value for attrs.
attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{Name, Value} | Rest], Acc) ->
    Rendered = [<<" ">>, escape(Name), <<"=\"">>,
                escape_attr(Value), <<"\"">>],
    attrs_to_html(Rest, [Rendered | Acc]).

%% Worker for escape/1: walk the character list, replacing the three
%% HTML-significant characters with entities; result is a binary.
escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape([$< | Rest], Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape([$> | Rest], Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape([$& | Rest], Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).

%% Worker for escape_attr/1: like escape/2 but additionally turns the
%% double-quote character (?QUOTE) into &quot; so the result can sit
%% inside a double-quoted attribute.
escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr([$< | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr([$> | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr([$& | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).

%% Normalize a tag given as an atom (or list/binary) to a lowercase
%% binary via norm/1.
to_tag(Tag) when is_atom(Tag) ->
    norm(atom_to_list(Tag));
to_tag(Tag) ->
    norm(Tag).

%% Worker for to_tokens/1.  The first argument is a stack of
%% {Tag, RemainingChildren} frames; when a frame runs out of children
%% the matching end_tag is emitted.  Tokens accumulate in reverse
%% order in Acc.  Most clauses normalize shorthand child forms into
%% the canonical {Tag, Attrs, Children} shape and retry.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    %% Frame exhausted: close the element.
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Void child: start tag only, its children are dropped.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            %% Push a frame for the child so its end_tag comes later.
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).

%% Main tokenizer loop.  <<_:O/binary>> matches only when exactly O
%% bytes remain, i.e. the offset has reached the end of the input.
%% After a <script> or <textarea> start tag, the element body is read
%% as a single raw data token since its content is not parsed as HTML.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.

%% Decide whether a token opens a raw-text element.  Only a
%% non-singleton start tag can: "script" and "textarea" (matched
%% case-insensitively) switch the tokenizer into raw mode.
parse_flag({start_tag, B, _, false}) ->
    Lower = string:to_lower(binary_to_list(B)),
    if
        Lower =:= "script"   -> script;
        Lower =:= "textarea" -> textarea;
        true                 -> none
    end;
parse_flag(_) ->
    none.

%% Produce the next token starting at offset O, dispatching on the
%% longest recognizable prefix; anything unrecognized is character data.
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            %% NOTE(review): "<!doctype" is 9 characters but 10 are
            %% advanced here (and in the uppercase clause below), which
            %% also consumes the byte after the keyword.  Harmless for
            %% the usual "<!DOCTYPE html>" input; confirm before changing.
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            %% Only "<?" is consumed, so the raw PI body keeps "php".
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
                when ?IS_WHITESPACE(C); not ?IS_LITERAL_SAFE(C) ->
            %% This isn't really strict HTML
            %% A "<" not followed by a tag-name character is kept as
            %% literal data with the "<" prepended back on.
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            %% Either an explicit "/>" or a known void element makes a
            %% singleton start tag.
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.

%% Coalesce a leading run of {data, _, _} tokens into a single binary.
%% AllWhitespace stays true only while every chunk seen was whitespace.
%% Returns {Binary, AllWhitespace, RemainingTokens}.
tree_data(Tokens, AllWhitespace, Acc) ->
    case Tokens of
        [{data, Data, Whitespace} | Rest] ->
            tree_data(Rest, Whitespace andalso AllWhitespace, [Data | Acc]);
        Rest ->
            {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}
    end.

%% Fold the token stream into a tree using a stack of open elements.
%% Each stack entry is {Name, Attrs, ReversedChildren}.
tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            %% Still inside the document: keep folding.
            tree(Rest, S);
        Result ->
            %% destack/2 collapsed the whole stack: the root was closed.
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    %% Singleton tags become leaf children; they are never pushed.
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            %% All-whitespace text between tags is dropped.
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    %% Doctypes carry no tree structure; skip them.
    tree(Rest, Stack).

%% Normalize a {Tag, Attrs} pair into an empty tree node with a
%% lowercase binary tag, lowercase binary attribute names, and binary
%% attribute values; bare tags pass through (binaries untouched,
%% strings lowercased).
norm({Tag, Attrs}) ->
    NormAttrs = [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs],
    {norm(Tag), NormAttrs, []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).

%% Push a new open element.  Mirroring HTML's optional end tags:
%% <li>/<option> implicitly close an open same-named sibling, and
%% <dd>/<dt> implicitly close an open <dd>/<dt>; everything else is
%% pushed as-is.
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].

%% Attach a completed child (leaf node, comment, pi, or data binary) to
%% the element on top of the open-element stack.  Children are kept in
%% reverse order until destack/1 reverses them.
append_stack_child(Child, [{Name, Attrs, Children} | Stack]) ->
    [{Name, Attrs, [Child | Children]} | Stack].

%% Close the open element TagName: pop stack frames down to (and
%% including) the matching one, folding each popped element into its
%% parent's child list.  Returns the new stack, or the finished tree
%% when closing unwinds the whole stack.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% F is true for every frame that is NOT the tag being closed.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    %% Look for the open tag among the top frame's
                    %% already-collected (reversed) children.
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent the children collected after it
                            %% under the reopened element.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.

%% Collapse the whole open-element stack into a single tree node,
%% reversing each frame's child list and folding it into its parent.
destack([{Tag, Attrs, Children}]) ->
    {Tag, Attrs, lists:reverse(Children)};
destack([{T1, A1, C1}, {T0, A0, C0} | Rest]) ->
    Folded = {T1, A1, lists:reverse(C1)},
    destack([{T0, A0, [Folded | C0]} | Rest]).

%% True for HTML 4 void elements: tags that never take a closing tag
%% or children.  Anything else (including non-binary input) is false.
is_singleton(Tag) ->
    lists:member(Tag, [<<"br">>, <<"hr">>, <<"img">>, <<"input">>,
                       <<"base">>, <<"meta">>, <<"link">>, <<"area">>,
                       <<"param">>, <<"col">>]).

%% Consume a run of character data, stopping at "<" or "&" (or end of
%% input).
tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

%% Start is the offset where the run began; Whitespace remains true
%% only while every byte seen was whitespace.
tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            %% Slice the run [Start, O) out of the original binary.
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.

%% Parse zero or more attributes up to (but not consuming) the tag
%% terminator.
tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            %% End of input: return what we have.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            %% Tag terminator; left for find_gt/2 to consume.
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            %% Processing-instruction terminator; left for find_qgt/2.
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.

%% After an attribute name: "=" introduces a value; otherwise the
%% attribute is valueless and its own name is used as the value
%% (e.g. <input checked>).
tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            S2 = skip_whitespace(B, ?INC_COL(S1)),
            tokenize_quoted_or_unquoted_attr_value(B, S2);
        _ ->
            {Attr, S1}
    end.

%% Attribute value after "=": quoted by ' or ", otherwise unquoted.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            %% Input ended right after "=": empty value.
            { [], S };
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.

%% Read a value until the matching quote Q, expanding &entity;
%% references along the way.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            %% Unterminated quote: take what we accumulated.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data|Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            %% Closing quote: consume it and finish.
            { iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S) };
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C|Acc], Q)
    end.

%% Read an unquoted value until "/>", ">", or whitespace, expanding
%% &entity; references along the way.
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data|Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            %% Leave "/>" unconsumed so find_gt/2 can record the slash.
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C|Acc])
    end.

%% Advance past consecutive whitespace, tracking line/column.
skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.

%% Read a tag or attribute name starting at the current offset.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                    orelse C =:= $/
                                    orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.

%% Accumulate name characters (expanding &entity; references) until
%% whitespace or one of ">", "/", "=".  The result is lowercased.
tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.

%% Capture raw text (e.g. a PHP block) up to the closing "?>".
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            %% Slice [Start, O) and skip past the "?>".
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            %% No "?>" before end of input: take the rest verbatim.
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.

%% Skip over a processing-instruction terminator ("?>", ">", or "/>").
%% Any intervening characters were already consumed by
%% tokenize_attributes/3, so only terminators (or EOF) appear here.
find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
                        ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
                        ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.

%% Scan to just past the next ">".  Returns {State, HasSlash}, where
%% HasSlash records whether any "/" was seen on the way (marks a
%% self-closing tag).
find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            %% End of input without ">".
            {S, HasSlash}
    end.

%% Parse a "&...;" character reference, positioned just after the "&".
%% Falls back to a literal "&" data token when the reference is invalid.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        tokenize_charref(Bin, S, O)
    catch
        throw:invalid_charref ->
            %% Not a valid reference: emit "&" and resume after it.
            {{data, <<"&">>, false}, S}
    end.

%% Worker: Start is the offset just after "&".  Throws invalid_charref
%% on end of input, a delimiter character, or an unknown entity name.
tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            %% Raw is the entity name between "&" and ";".
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           throw(invalid_charref);
                       Unichar when is_integer(Unichar) ->
                           mochiutf8:codepoint_to_bytes(Unichar);
                       Unichars when is_list(Unichars) ->
                           unicode:characters_to_binary(Unichars)
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.

%% Collect whitespace-separated doctype parts up to the closing ">".
tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            %% End of input: return what we collected.
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.

%% A doctype part is either a quoted word or a bare literal.  There is
%% deliberately no clause for leading whitespace (the caller skips it),
%% so a whitespace byte here crashes with function_clause.
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.

%% Read a quoted word up to the matching Quote character, expanding
%% &entity; references along the way.
tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            %% Unterminated quote: take what we accumulated.
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            %% Closing quote: consume it and finish.
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.

%% Capture CDATA content up to the closing "]]>" (or end of input).
tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            %% Slice [Start, O) and skip past "]]>".
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            %% NOTE(review): at end of input this slices from O (which
            %% is already at the end), yielding <<>> and dropping the
            %% scanned content; tokenize_comment/3 slices from Start in
            %% the analogous case.  Confirm intent before changing.
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.

%% Capture comment content up to the closing "-->" (or end of input).
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            %% Slice [Start, O) and skip past "-->".
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Reached only when O is at end of input with no "-->":
            %% everything from the comment start is the comment.
            {{comment, Raw}, S}
    end.

%% Consume raw <script> content up to (but not including) a
%% case-insensitive "</script" close tag.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
        when (SS =:= $s orelse SS =:= $S) andalso
             (CC =:= $c orelse CC =:= $C) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (II =:= $i orelse II =:= $I) andalso
             (PP =:= $p orelse PP =:= $P) andalso
             (TT=:= $t orelse TT =:= $T) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            %% The close tag itself is left for tokenize/2 to emit.
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% End of input: everything since the start tag is data.
            {{data, Raw, false}, S}
    end.

%% Consume raw <textarea> content up to (but not including) a
%% case-insensitive "</textarea" close tag.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
        when (TT =:= $t orelse TT =:= $T) andalso
             (EE =:= $e orelse EE =:= $E) andalso
             (XX =:= $x orelse XX =:= $X) andalso
             (TT2 =:= $t orelse TT2 =:= $T) andalso
             (AA =:= $a orelse AA =:= $A) andalso
             (RR =:= $r orelse RR =:= $R) andalso
             (EE2 =:= $e orelse EE2 =:= $E) andalso
             (AA2 =:= $a orelse AA2 =:= $A) andalso
             ?PROBABLE_CLOSE(ZZ) ->
            %% The close tag itself is left for tokenize/2 to emit.
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% End of input: everything since the start tag is data.
            {{data, Raw, false}, S}
    end.
