PageRenderTime 82ms CodeModel.GetById 40ms app.highlight 8ms RepoModel.GetById 32ms app.codeStats 0ms

/scripts/entities.erl

http://github.com/basho/mochiweb
Erlang | 45 lines | 35 code | 7 blank | 3 comment | 2 complexity | b00b74e3ca077fa3fdb78cf07f47bfae MD5 | raw file
 1#!/usr/bin/env escript
 2%% -*- mode: erlang -*-
 3-export([main/1]).
 4
 5%% @doc Script used to generate mochiweb_charref.erl table.
 6
 7main(_) ->
 8    application:start(inets),
 9    code:add_patha("ebin"),
10    {ok, {_, _, HTML}} = httpc:request("http://www.w3.org/TR/html5/named-character-references.html"),
11    print(lists:sort(search(mochiweb_html:parse(HTML)))).
12
%% @doc Write one generated clause per entity to standard output, each
%% terminated by ";\n", followed by the final fallback clause that maps
%% every other name to `undefined'. Returns `ok'.
print(Entities) ->
    EmitClause = fun(Entity) ->
                         io:put_chars([clause(Entity), ";\n"])
                 end,
    lists:foreach(EmitClause, Entities),
    io:put_chars(["entity(_) -> undefined.\n"]),
    ok.
19
%% @doc Build the iolist for a single `entity/1' clause (without the
%% trailing separator). An entity with exactly one code point maps to a
%% bare `16#...' integer literal; an entity with several code points maps
%% to a list of such literals.
clause({Name, [Single]}) ->
    ["entity(\"", Name, "\") -> 16#", Single];
clause({Name, [Lead | Others]}) ->
    %% Every code point after the first is preceded by ", 16#".
    Trailing = [[", 16#", CP] || CP <- Others],
    ["entity(\"", Name, "\") -> [16#", Lead,
     Trailing,
     "]"].
26
27
%% @doc Collect the entity rows found in the parse tree rooted at `Root'.
%% Delegates to search/2 with an empty accumulator.
search(Root) ->
    Found = [],
    search(Root, Found).
30
%% @doc Depth-first walk over a parsed HTML tree. For every
%% `<tr id="entity-...">' row a `{Title, [HexCodepoint]}' tuple is
%% prepended to `Acc', where `Title' is the entity name without its
%% trailing semicolon and each code point is the hexadecimal digit string
%% following a `U+' marker. Text nodes, comments and processing
%% instructions contribute nothing. Returns the updated accumulator.
search({<<"tr">>, [{<<"id">>, <<"entity-", _/binary>>} | _], Children}, Acc) ->
    %% HTML5 charrefs can have more than one code point(!)
    [{<<"td">>, _, [{<<"code">>, _, [TitleSemi]}]},
     {<<"td">>, [], [RawCPs]} | _] = Children,
    %% Drop the trailing ";" from the entity name; crashes if it is absent.
    L = byte_size(TitleSemi) - 1,
    <<Title:L/binary, $;>> = TitleSemi,
    %% Each global match is [WholeMatch, HexDigits]; only the digits are kept.
    {match, Matches} = re:run(RawCPs, "(?:\\s*U\\+)([a-fA-F0-9]+)",
                              [{capture, all, binary}, global]),
    [{Title, [CP || [_, CP] <- Matches]} | Acc];
%% Non-element nodes. These clauses must come before the generic element
%% clause: a `{pi, Tag, Attrs}' tuple would otherwise be taken for an
%% element and its attributes traversed as children.
%% NOTE(review): node shapes assumed from mochiweb_html's parse output --
%% confirm against the parser version in use.
search({comment, _Text}, Acc) ->
    Acc;
search({pi, _Raw}, Acc) ->
    Acc;
search({pi, _Tag, _Attrs}, Acc) ->
    Acc;
search({Tag, Attrs, [H | T]}, Acc) ->
    %% Visit the first child, then continue with the remaining siblings.
    search({Tag, Attrs, T}, search(H, Acc));
search({_Tag, _Attrs, []}, Acc) ->
    Acc;
search(<<_/binary>>, Acc) ->
    Acc.