/scripts/entities.erl
Erlang | 45 lines | 35 code | 7 blank | 3 comment | 2 complexity | b00b74e3ca077fa3fdb78cf07f47bfae MD5 | raw file
Possible License(s): MIT
- #!/usr/bin/env escript
- %% -*- mode: erlang -*-
- -export([main/1]).
%% @doc Script used to generate the mochiweb_charref.erl table.
%%
%% Fetches the W3C HTML5 named character reference page, parses it with
%% mochiweb_html (looked up via the "ebin" directory added to the code path)
%% and prints one entity/1 clause per reference, sorted, on standard output.
%% The command-line arguments are ignored.
%%
%% Fails with a badmatch if the request does not succeed or the final
%% response status is not 200.
main(_) ->
    %% httpc is provided by the inets application.
    application:start(inets),
    code:add_patha("ebin"),
    Url = "http://www.w3.org/TR/html5/named-character-references.html",
    %% Assert on the status code: an error page would otherwise be parsed
    %% like the real document and silently yield a table with no entities.
    {ok, {{_HttpVersion, 200, _ReasonPhrase}, _Headers, HTML}} =
        httpc:request(Url),
    print(lists:sort(search(mochiweb_html:parse(HTML)))).
%% Writes the generated table to standard output: one "entity(...) -> ...;"
%% line per {Title, Codepoints} entry, in list order, followed by the
%% catch-all clause that terminates the function. Returns ok.
print(Entities) ->
    lists:foreach(
        fun(Entity) ->
            io:put_chars([clause(Entity), ";\n"])
        end,
        Entities
    ),
    io:put_chars(["entity(_) -> undefined.\n"]),
    ok.
%% Renders one entity/1 clause (without the trailing ";") as an iolist.
%%
%% Name is the entity name and Codepoints a non-empty list of hexadecimal
%% code point strings. A single code point is emitted as a bare 16#... integer;
%% several code points are emitted as a list of 16#... integers.
clause({Name, [_ | _] = Codepoints}) ->
    case Codepoints of
        [Only] ->
            ["entity(\"", Name, "\") -> 16#", Only];
        [Head | Tail] ->
            Others = [[", 16#", Hex] || Hex <- Tail],
            ["entity(\"", Name, "\") -> [16#", Head, Others, "]"]
    end.
%% Collects every named character reference found in a parsed HTML tree.
%% Entry point for search/2, starting from an empty accumulator.
search(Tree) ->
    Found = [],
    search(Tree, Found).
%% Walks a parsed HTML tree depth-first and prepends a {Title, Codepoints}
%% tuple to Acc for every table row whose first attribute is an id starting
%% with "entity-". Title is the entity name without its trailing ";" and
%% Codepoints is the list of hexadecimal strings that follow each "U+" in the
%% row's second cell. Matched rows are not descended into.
%%
%% Text nodes (binaries) are skipped, as are comment and processing
%% instruction nodes; the latter are assumed to appear as {comment, _},
%% {pi, _} and {pi, _, _} per mochiweb_html's node types.
%% A row that does not have the expected cell layout fails with a badmatch.
search({<<"tr">>, [{<<"id">>, <<"entity-", _/binary>>} | _], Children}, Acc) ->
    %% HTML5 charrefs can have more than one code point(!)
    [{<<"td">>, _, [{<<"code">>, _, [TitleSemi]}]},
     {<<"td">>, [], [RawCPs]} | _] = Children,
    %% The name cell carries a trailing semicolon; assert it and drop it.
    L = byte_size(TitleSemi) - 1,
    <<Title:L/binary, $;>> = TitleSemi,
    {match, Matches} = re:run(RawCPs, "(?:\\s*U\\+)([a-fA-F0-9]+)",
                              [{capture, all, binary}, global]),
    [{Title, [CP || [_, CP] <- Matches]} | Acc];
%% Non-element nodes carry no entity rows. These clauses must come before the
%% generic element clause: a {pi, Tag, Attrs} node would otherwise be walked
%% as if its attribute list were a list of child nodes.
search({comment, _Text}, Acc) ->
    Acc;
search({pi, _Raw}, Acc) ->
    Acc;
search({pi, _Tag, _Attrs}, Acc) ->
    Acc;
search({Tag, Attrs, [H | T]}, Acc) ->
    search({Tag, Attrs, T}, search(H, Acc));
search({_Tag, _Attrs, []}, Acc) ->
    Acc;
search(<<_/binary>>, Acc) ->
    Acc.