%% File: src/mochiweb_html.erl
%% (Viewer/scraper metadata removed: "Erlang | 792 lines | 649 code |
%%  64 blank | 79 comment | b945da9d1ef5644e84e7bdaf30a7033a MD5 | raw file"
%%  was not part of the source and made the file invalid Erlang.)
%% @author Bob Ippolito <bob@mochimedia.com>
%% @copyright 2007 Mochi Media, Inc.
%%
%% Permission is hereby granted, free of charge, to any person obtaining a
%% copy of this software and associated documentation files (the "Software"),
%% to deal in the Software without restriction, including without limitation
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
%% and/or sell copies of the Software, and to permit persons to whom the
%% Software is furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be included in
%% all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
%% DEALINGS IN THE SOFTWARE.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1]).
%% NOTE(review): export_all makes the explicit export list above and the
%% TEST-only exports below redundant, and defeats xref/Dialyzer analysis.
%% Consider removing it once it is confirmed no caller relies on it.
-compile([export_all]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\"). %% $\"
-define(SQUOTE, $\'). %% $\'

%% Advance the decoder position by N bytes on the current line
%% (no newline accounting -- use ?INC_CHAR for arbitrary characters).
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
%% Advance the decoder position by exactly one byte/column.
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
%% Consume a newline: reset column to 1, bump line, advance offset.
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
%% Consume one character C with line/column tracking: a newline resets
%% the column and bumps the line counter; anything else advances the
%% column. Both branches advance the byte offset by one.
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).

-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
%% ASCII alphanumerics only -- characters that may start a tag name.
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).
%% True when C plausibly terminates a tag name (">" or whitespace).
-define(PROBABLE_CLOSE(C),
        (C =:= $> orelse ?IS_WHITESPACE(C))).

%% Tokenizer position: 1-based line/column (for diagnostics) and the
%% 0-based byte offset into the input binary that drives all matching.
-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).

%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions.
    %% Deliberately crashes with badmatch if no start tag is found
    %% (find_document/2 returns []) -- let it crash.
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.
%% Scan past leading doctype/PI/comment tokens until the first real
%% start tag, remembering whether an HTML5 doctype was seen on the way.
%% Returns [] when the stream contains no start tag at all.
find_document([{start_tag, _, _, false} | _] = Toks, Mode) ->
    maybe_add_html_tag(Toks, Mode);
find_document([{doctype, [<<"html">>]} | Tail], _Mode) ->
    find_document(Tail, html5);
find_document([_Skipped | Tail], Mode) ->
    find_document(Tail, Mode);
find_document([], _Mode) ->
    [].

%% In html5 mode, synthesize the implicit <html> root element when the
%% document opens with some other tag; otherwise leave tokens untouched.
maybe_add_html_tag([{start_tag, Name, _, false} | _] = Toks, html5)
  when Name =/= <<"html">> ->
    [{start_tag, <<"html">>, [], false} | Toks];
maybe_add_html_tag(Toks, _Mode) ->
    Toks.

%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).

%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert a html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    %% Bare {br}-style one-tuple: treat as an empty element.
    to_tokens({Tag0, [], []});
to_tokens({'=', _} = Token) ->
    [Token];
to_tokens({doctype, _} = Token) ->
    [Token];
to_tokens({comment, _} = Token) ->
    [Token];
to_tokens({Tag0, Acc}) ->
    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    case is_singleton(Tag) of
        true ->
            to_tokens([], [{start_tag, Tag, Attrs, true}]);
        false ->
            to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, false}])
    end.

%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to a HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).

%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(Bin) when is_binary(Bin) ->
    escape(binary_to_list(Bin), []);
escape(Atom) when is_atom(Atom) ->
    escape(atom_to_list(Atom), []);
escape(Str) when is_list(Str) ->
    escape(Str, []).
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;). Integers and floats are first rendered
%%      as decimal text (floats via mochinum:digits/1).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    escape_attr(mochinum:digits(F), []).

%% Render a token stream to an iolist, accumulating output in reverse.
to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    %% Raw (pre-escaped) inline HTML passes through untouched.
    to_html(Rest, [Content | Acc]);
to_html([{pi, Bin} | Rest], Acc) ->
    Open = [<<"<?">>,
            Bin,
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    %% BUG FIX: doctype words must render into a fresh accumulator.
    %% Seeding doctype_to_html/2 with Acc (as before) duplicated all
    %% previously rendered output inside the <!DOCTYPE ...> element
    %% whenever the doctype was not the very first token.
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).
%% Render doctype words: bare alphanumeric words are emitted as-is,
%% anything else (e.g. DTD URLs) is double-quoted and escaped.
doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.

%% Render attributes as: space, escaped name, ="escaped value".
attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).

%% BUG FIX: the entity replacement strings below had been corrupted to
%% the literal characters themselves ("<" instead of "&lt;", etc. --
%% and a bare quote character for the quot clause, which did not even
%% parse), turning escaping into a no-op. Restored the HTML entities.
%% Entities are lists:reverse/2'd onto Acc because Acc is built backwards.
escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).

%% Attribute variant of escape/2: additionally escapes double quotes.
escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).

%% Normalize a tag given as atom or string to a lowercase binary.
to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).
%% Worklist-driven flattening of an html_node() tree into a token list.
%% Each worklist element is {ParentTag, RemainingChildren}; the many
%% clauses below normalize the various shorthand node shapes first.
to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    %% Parent's children exhausted: emit its end tag and pop.
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
    %% Allow {pi, binary(), list()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            %% Singletons never recurse into children.
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            %% Push the child element's own worklist entry first.
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% List text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Binary text
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).

%% Main tokenizer loop: emit tokens until the offset reaches the end of
%% the input binary. <script>/<textarea> start tags switch the next
%% token to raw-text mode via parse_flag/1.
tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            %% Offset is at the end of the input: done.
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.

%% Classify a token: does it open a raw-text element?
%% Only non-singleton start tags for script/textarea qualify.
parse_flag({start_tag, B, _, false}) ->
    case string:to_lower(binary_to_list(B)) of
        "script" ->
            script;
        "textarea" ->
            textarea;
        _ ->
            none
    end;
parse_flag(_) ->
    none.

%% Produce the next token starting at the decoder offset, dispatching
%% on the leading bytes (comment, doctype, CDATA, PI, charref, end tag,
%% start tag, or plain data).
tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!doctype", _/binary>> ->
            %% NOTE(review): "<!doctype" is 9 chars but 10 are skipped;
            %% presumably the extra char is the usual space before the
            %% root element name -- confirm for "<!DOCTYPE>" inputs.
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?php", _/binary>> ->
            {Body, S1} = raw_qgt(B, ?ADV_COL(S, 2)),
            {{pi, Body}, S1};
        <<_:O/binary, "<?", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>>
          when ?IS_WHITESPACE(C); not ?IS_LITERAL_SAFE(C) ->
            %% This isn't really strict HTML
            %% (a "<" not followed by a name char is treated as data).
            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
            {{data, <<$<, Data/binary>>, false}, S1};
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(Tag),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.

%% Coalesce consecutive data tokens into one binary; AllWhitespace stays
%% true only while every run consisted solely of whitespace.
tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

%% Build the HTML tree from the token stream using an explicit stack of
%% open elements (see stack/2 and destack/1,2).
tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            %% The root element was closed: parsing is complete.
            {Result, []}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    %% Singleton: a completed node, never pushed as an open element.
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Raw} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            %% All-whitespace text runs are dropped from the tree.
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end;
tree([{doctype, _} | Rest], Stack) ->
    tree(Rest, Stack).

%% Normalize a {Tag, Attrs} pair (or a bare tag) to the internal node
%% shape: lowercase binary tag, binary attribute values, empty children.
norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).
%% Push a new open element onto the stack, implicitly closing a
%% still-open sibling for the HTML optional-end-tag cases:
%% <li> closes <li>, <option> closes <option>, <dd>/<dt> close each other.
stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].

%% Append a completed child node to the element on top of the stack.
%% Children accumulate in reverse order and are flipped in destack/1.
append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].

%% Close the innermost open element named TagName, folding everything
%% above it on the stack into its parent's child list. Returns a new
%% stack, or a completed tree node when the root itself was closed.
destack(<<"br">>, Stack) ->
    %% This is an ugly hack to make dumb_br_test() pass,
    %% this makes it such that br can never have children.
    Stack;
destack(TagName, Stack) when is_list(Stack) ->
    %% F is true for stack frames that are NOT the tag being closed.
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% If we're parsing something like XML we might find
            %% a <link>tag</link> that is normally a singleton
            %% in HTML but isn't here
            case {is_singleton(TagName), Stack} of
                {true, [{T0, A0, Acc0} | Post0]} ->
                    case lists:splitwith(F, Acc0) of
                        {_, []} ->
                            %% Actually was a singleton
                            Stack;
                        {Pre, [{T1, A1, Acc1} | Post1]} ->
                            %% Reparent the children collected after the
                            %% supposed singleton back into it.
                            [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]}
                             | Post0]
                    end;
                _ ->
                    %% No match, no state change
                    Stack
            end;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.

%% Collapse the remaining stack into a single completed tree node,
%% reversing each element's accumulated children on the way down.
destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
464 465is_singleton(<<"br">>) -> true; 466is_singleton(<<"hr">>) -> true; 467is_singleton(<<"img">>) -> true; 468is_singleton(<<"input">>) -> true; 469is_singleton(<<"base">>) -> true; 470is_singleton(<<"meta">>) -> true; 471is_singleton(<<"link">>) -> true; 472is_singleton(<<"area">>) -> true; 473is_singleton(<<"param">>) -> true; 474is_singleton(<<"col">>) -> true; 475is_singleton(_) -> false. 476 477tokenize_data(B, S=#decoder{offset=O}) -> 478 tokenize_data(B, S, O, true). 479 480tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) -> 481 case B of 482 <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) -> 483 tokenize_data(B, ?INC_CHAR(S, C), Start, 484 (Whitespace andalso ?IS_WHITESPACE(C))); 485 _ -> 486 Len = O - Start, 487 <<_:Start/binary, Data:Len/binary, _/binary>> = B, 488 {{data, Data, Whitespace}, S} 489 end. 490 491tokenize_attributes(B, S) -> 492 tokenize_attributes(B, S, []). 493 494tokenize_attributes(B, S=#decoder{offset=O}, Acc) -> 495 case B of 496 <<_:O/binary>> -> 497 {lists:reverse(Acc), S}; 498 <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) -> 499 {lists:reverse(Acc), S}; 500 <<_:O/binary, "?>", _/binary>> -> 501 {lists:reverse(Acc), S}; 502 <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) -> 503 tokenize_attributes(B, ?INC_CHAR(S, C), Acc); 504 _ -> 505 {Attr, S1} = tokenize_literal(B, S), 506 {Value, S2} = tokenize_attr_value(Attr, B, S1), 507 tokenize_attributes(B, S2, [{Attr, Value} | Acc]) 508 end. 509 510tokenize_attr_value(Attr, B, S) -> 511 S1 = skip_whitespace(B, S), 512 O = S1#decoder.offset, 513 case B of 514 <<_:O/binary, "=", _/binary>> -> 515 S2 = skip_whitespace(B, ?INC_COL(S1)), 516 tokenize_quoted_or_unquoted_attr_value(B, S2); 517 _ -> 518 {Attr, S1} 519 end. 
%% Dispatch on the first character of the value: quoted (single or
%% double) vs unquoted; end of input yields an empty value.
tokenize_quoted_or_unquoted_attr_value(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary>> ->
            { [], S };
        <<_:O/binary, Q, _/binary>> when Q =:= ?QUOTE orelse
                                         Q =:= ?SQUOTE ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [], Q);
        <<_:O/binary, _/binary>> ->
            tokenize_unquoted_attr_value(B, S, [])
    end.

%% Accumulate a quoted value until the matching quote Q, expanding
%% character references ("&...;") along the way.
tokenize_quoted_attr_value(B, S=#decoder{offset=O}, Acc, Q) ->
    case B of
        <<_:O/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_quoted_attr_value(B, S1, [Data|Acc], Q);
        <<_:O/binary, Q, _/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S) };
        <<_:O/binary, C, _/binary>> ->
            tokenize_quoted_attr_value(B, ?INC_COL(S), [C|Acc], Q)
    end.

%% Accumulate an unquoted value up to "/>", ">" or whitespace
%% (the terminator itself is not consumed).
tokenize_unquoted_attr_value(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(B, ?INC_COL(S)),
            tokenize_unquoted_attr_value(B, S1, [Data|Acc]);
        <<_:O/binary, $/, $>, _/binary>> ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> when ?PROBABLE_CLOSE(C) ->
            { iolist_to_binary(lists:reverse(Acc)), S };
        <<_:O/binary, C, _/binary>> ->
            tokenize_unquoted_attr_value(B, ?INC_COL(S), [C|Acc])
    end.

%% Advance the decoder past any run of whitespace.
skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.

%% Tokenize a tag or attribute name; result is a lowercased binary.
tokenize_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= $>
                                         orelse C =:= $/
                                         orelse C =:= $= ->
            %% Handle case where tokenize_literal would consume
            %% 0 chars. http://github.com/mochi/mochiweb/pull/13
            {[C], ?INC_COL(S)};
        _ ->
            tokenize_literal(Bin, S, [])
    end.
%% Accumulate name characters (expanding charrefs) until whitespace,
%% ">", "/" or "="; the collected name is lowercased.
tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(string:to_lower(lists:reverse(Acc))), S}
    end.

%% Capture the raw body of a "<?php ... ?>"-style block up to "?>".
raw_qgt(Bin, S=#decoder{offset=O}) ->
    raw_qgt(Bin, S, O).

raw_qgt(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {Raw, ?ADV_COL(S, 2)};
        <<_:O/binary, C, _/binary>> ->
            raw_qgt(Bin, ?INC_CHAR(S, C), Start);
        <<_:O/binary>> ->
            %% Unterminated: take everything to the end of the input.
            <<_:Start/binary, Raw/binary>> = Bin,
            {Raw, S}
    end.

%% Skip the terminator of a processing instruction ("?>", ">", "/>").
find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, ">", _/binary>> ->
            ?ADV_COL(S, 1);
        <<_:O/binary, "/>", _/binary>> ->
            ?ADV_COL(S, 2);
        %% tokenize_attributes takes care of this state:
        %% <<_:O/binary, C, _/binary>> ->
        %%     find_qgt(Bin, ?INC_CHAR(S, C));
        <<_:O/binary>> ->
            S
    end.

find_gt(Bin, S) ->
    find_gt(Bin, S, false).

%% Scan to just past ">", remembering whether a "/" was seen anywhere
%% on the way (which marks the tag as self-closing).
find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.

%% Tokenize "&...;". On any malformed reference the scanner throws,
%% and we fall back to a literal "&" without consuming input.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
    try
        tokenize_charref(Bin, S, O)
    catch
        throw:invalid_charref ->
            {{data, <<"&">>, false}, S}
    end.
%% Scan a character reference body up to ";" and decode it via
%% mochiweb_charref; throws invalid_charref on end of input, on any
%% delimiter character, or on an unknown entity name.
tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            throw(invalid_charref);
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $/
                                         orelse C =:= $> ->
            throw(invalid_charref);
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           throw(invalid_charref);
                       Unichar when is_integer(Unichar) ->
                           %% Single codepoint: encode as UTF-8 bytes.
                           mochiutf8:codepoint_to_bytes(Unichar);
                       Unichars when is_list(Unichars) ->
                           %% Multi-codepoint entity.
                           unicode:characters_to_binary(Unichars)
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.

tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

%% Collect the whitespace-separated words of a doctype up to ">".
tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.

%% A doctype word is either quoted (tokenize_word) or a bare literal.
%% No clause for leading whitespace -- callers skip it first.
tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
            %% Sanity check for whitespace
            tokenize_literal(Bin, S)
    end.

tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).
%% Accumulate a quoted word until the matching Quote character,
%% expanding character references along the way.
tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.

%% Capture the raw contents of a CDATA section, up to "]]>".
tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            %% Unterminated CDATA (offset at end of input).
            %% NOTE(review): this slices from O, not Start, so the data
            %% scanned so far is dropped (Raw is empty at EOF). The
            %% sibling fallbacks (raw_qgt, tokenize_comment) slice from
            %% Start -- confirm whether Start was intended here.
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.

%% Capture the contents of an HTML comment, up to "-->".
tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Unterminated comment: everything to end of input.
            {{comment, Raw}, S}
    end.

%% Entry point: remember where the raw <script> content starts.
tokenize_script(Bin, S=#decoder{offset=O}) ->
    tokenize_script(Bin, S, O).
%% Consume raw <script> content up to (but not including) a
%% case-insensitive "</script" followed by ">" or whitespace.
tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, C1, C2, C3, C4, C5, C6, Close, _/binary>>
          when (C1 =:= $s orelse C1 =:= $S) andalso
               (C2 =:= $c orelse C2 =:= $C) andalso
               (C3 =:= $r orelse C3 =:= $R) andalso
               (C4 =:= $i orelse C4 =:= $I) andalso
               (C5 =:= $p orelse C5 =:= $P) andalso
               (C6 =:= $t orelse C6 =:= $T) andalso
               ?PROBABLE_CLOSE(Close) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, Ch, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, Ch), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Ran off the end of the input: everything left is data.
            {{data, Raw, false}, S}
    end.

%% Entry point: remember where the raw <textarea> content starts.
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
    tokenize_textarea(Bin, S, O).

%% Consume raw <textarea> content up to (but not including) a
%% case-insensitive "</textarea" followed by ">" or whitespace.
tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, C1, C2, C3, C4, C5, C6, C7, C8, Close, _/binary>>
          when (C1 =:= $t orelse C1 =:= $T) andalso
               (C2 =:= $e orelse C2 =:= $E) andalso
               (C3 =:= $x orelse C3 =:= $X) andalso
               (C4 =:= $t orelse C4 =:= $T) andalso
               (C5 =:= $a orelse C5 =:= $A) andalso
               (C6 =:= $r orelse C6 =:= $R) andalso
               (C7 =:= $e orelse C7 =:= $E) andalso
               (C8 =:= $a orelse C8 =:= $A) andalso
               ?PROBABLE_CLOSE(Close) ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, Ch, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, Ch), Start);
        <<_:Start/binary, Raw/binary>> ->
            %% Ran off the end of the input: everything left is data.
            {{data, Raw, false}, S}
    end.