/jaerlang-code/code/lib_trigrams.erl

https://github.com/killme2008/erlib · Erlang · 152 lines · 101 code · 35 blank · 16 comment · 0 complexity · 09c0e29989e90d98aed9d947f1da8abc MD5 · raw file

  1. %% ---
  2. %% Excerpted from "Programming Erlang",
  3. %% published by The Pragmatic Bookshelf.
  4. %% Copyrights apply to this code. It may not be used to create training material,
  5. %% courses, books, articles, and the like. Contact us if you are in doubt.
  6. %% We make no guarantees that this code is fit for any purpose.
  7. %% Visit http://www.pragmaticprogrammer.com/titles/jaerlang for more book information.
  8. %%---
  9. -module(lib_trigrams).
  10. -export([for_each_trigram_in_the_english_language/2,
  11. make_tables/0, timer_tests/0,
  12. open/0, close/1, is_word/2,
  13. how_many_trigrams/0,
  14. make_ets_set/0, make_ets_ordered_set/0, make_mod_set/0,
  15. lookup_all_ets/2, lookup_all_set/2
  16. ]).
  17. -import(lists, [reverse/1]).
  %% Builds the three on-disk trigram sets (ETS ordered_set, ETS set and a
  %% `sets' term) and prints a small report for each one.
  %%
  %% "size" is the resulting file size divided by the number of entries in the
  %% ordered-set table; "time/trigram" is the build time in microseconds
  %% divided by the total number of trigrams visited by the iterator.
  %% Returns the result of the final io:format/2 call.
  make_tables() ->
      {CountMicros, Total} = timer:tc(?MODULE, how_many_trigrams, []),
      io:format("Counting - No of trigrams=~p time/trigram=~p~n",
                [Total, CountMicros / Total]),

      {OrderedMicros, Distinct} = timer:tc(?MODULE, make_ets_ordered_set, []),
      OrderedBytes = filelib:file_size("trigramsOS.tab"),
      io:format("Ets ordered Set size=~p time/trigram=~p~n",
                [OrderedBytes / Distinct, OrderedMicros / Total]),

      {SetMicros, _} = timer:tc(?MODULE, make_ets_set, []),
      SetBytes = filelib:file_size("trigramsS.tab"),
      io:format("Ets set size=~p time/trigram=~p~n",
                [SetBytes / Distinct, SetMicros / Total]),

      {ModMicros, _} = timer:tc(?MODULE, make_mod_set, []),
      ModBytes = filelib:file_size("trigrams.set"),
      io:format("Module sets size=~p time/trigram=~p~n",
                [ModBytes / Distinct, ModMicros / Total]).
  %% Builds the trigram table as an ETS ordered_set and dumps it to
  %% "trigramsOS.tab". Returns whatever make_a_set/2 returns.
  make_ets_ordered_set() ->
      TableType = ordered_set,
      make_a_set(TableType, "trigramsOS.tab").
  %% Builds the trigram table as an ETS set and dumps it to "trigramsS.tab".
  %% Returns whatever make_a_set/2 returns.
  make_ets_set() ->
      TableType = set,
      make_a_set(TableType, "trigramsS.tab").
  %% Builds an ETS table of the given Type holding every trigram as a
  %% one-element tuple {Binary}, dumps it to FileName and returns the number
  %% of entries in the table.
  %%
  %% Crashes (badmatch) if the table cannot be written to FileName instead of
  %% silently reporting a size for a file that was never produced. The
  %% temporary table is always deleted, even when scanning or dumping fails.
  make_a_set(Type, FileName) ->
      Tab = ets:new(table, [Type]),
      try
          %% The accumulator is unused; the table itself collects the result.
          Insert = fun(Str, Acc) ->
                           ets:insert(Tab, {list_to_binary(Str)}),
                           Acc
                   end,
          _ = for_each_trigram_in_the_english_language(Insert, 0),
          ok = ets:tab2file(Tab, FileName),
          ets:info(Tab, size)
      after
          ets:delete(Tab)
      end.
  %% Collects every trigram (as a binary) into a `sets' set and writes the
  %% serialized set to "trigrams.set". Returns the result of
  %% file:write_file/2.
  make_mod_set() ->
      AddTrigram = fun(Trigram, Acc) ->
                           sets:add_element(list_to_binary(Trigram), Acc)
                   end,
      Trigrams = for_each_trigram_in_the_english_language(AddTrigram, sets:new()),
      Serialized = term_to_binary(Trigrams),
      file:write_file("trigrams.set", [Serialized]).
  %% Times lookups against the two dumped ETS tables and then against the
  %% serialized `sets' set, printing one line per data structure.
  timer_tests() ->
      EtsCases = [{"Ets ordered Set", "trigramsOS.tab"},
                  {"Ets set", "trigramsS.tab"}],
      lists:foreach(fun({Label, File}) -> time_lookup_ets_set(Label, File) end,
                    EtsCases),
      time_lookup_module_sets().
  %% Loads the ETS table stored in File, looks up every entry it contains and
  %% prints the average lookup time in microseconds, labelled with Type.
  %% The table is deleted afterwards.
  time_lookup_ets_set(Type, File) ->
      {ok, Tab} = ets:file2tab(File),
      Entries = ets:tab2list(Tab),
      Count = length(Entries),
      {Micros, _} = timer:tc(?MODULE, lookup_all_ets, [Tab, Entries]),
      PerLookup = Micros / Count,
      io:format("~s lookup=~p micro seconds~n", [Type, PerLookup]),
      ets:delete(Tab).
  %% Looks up the key of every {Key} tuple in the list in the ETS table Tab.
  %% The lookup results are discarded; returns ok.
  lookup_all_ets(Tab, [{Key} | Rest]) ->
      _ = ets:lookup(Tab, Key),
      lookup_all_ets(Tab, Rest);
  lookup_all_ets(_Tab, []) ->
      ok.
  %% Reads the serialized set from "trigrams.set", checks membership of every
  %% element it contains and prints the average time per check in
  %% microseconds.
  time_lookup_module_sets() ->
      {ok, Serialized} = file:read_file("trigrams.set"),
      Trigrams = binary_to_term(Serialized),
      Elements = sets:to_list(Trigrams),
      Count = length(Elements),
      {Micros, _} = timer:tc(?MODULE, lookup_all_set, [Trigrams, Elements]),
      PerLookup = Micros / Count,
      io:format("Module set lookup=~p micro seconds~n", [PerLookup]).
  %% Tests every key in the list for membership in Set.
  %% The individual results are discarded; returns ok.
  lookup_all_set(Set, [Key | Rest]) ->
      _ = sets:is_element(Key, Set),
      lookup_all_set(Set, Rest);
  lookup_all_set(_Set, []) ->
      ok.
  %% Counts how many trigrams the iterator visits (repeated trigrams are
  %% counted every time they occur).
  how_many_trigrams() ->
      for_each_trigram_in_the_english_language(
        fun(_Trigram, Count) -> Count + 1 end,
        0).
  %% Folds F over every trigram found in the gzipped word list
  %% "354984si.ngl.gz" (read from the current working directory).
  %% F is called as F(Trigram, Acc) with Trigram a three-character list and
  %% must return the new accumulator; A0 is the initial accumulator.
  %% Returns the final accumulator.
  for_each_trigram_in_the_english_language(F, A0) ->
      {ok, Compressed} = file:read_file("354984si.ngl.gz"),
      Chars = binary_to_list(zlib:gunzip(Compressed)),
      scan_word_list(Chars, F, A0).
  %% Walks the character list one word at a time, folding F over the
  %% trigrams of each word. A space is put in front of each word here;
  %% get_next_word/2 already appends the trailing one.
  scan_word_list([], _F, Acc) ->
      Acc;
  scan_word_list(Chars, F, Acc) ->
      {Word, Remaining} = get_next_word(Chars, []),
      scan_word_list(Remaining, F, scan_trigrams([$\s | Word], F, Acc)).
  %% Splits the next word off the front of a character list.
  %%
  %% The second argument accumulates the word in reverse. When a line ending
  %% ("\r\n" or a bare "\n") or the end of input is reached, a trailing space
  %% is added and the word is reversed back into reading order.
  %% Returns {WordWithTrailingSpace, RemainingChars}.
  get_next_word([$\r, $\n | T], L) -> {lists:reverse([$\s | L]), T};
  %% Also accept Unix line endings so the whole file is not read as one word.
  get_next_word([$\n | T], L)      -> {lists:reverse([$\s | L]), T};
  get_next_word([H | T], L)        -> get_next_word(T, [H | L]);
  get_next_word([], L)             -> {lists:reverse([$\s | L]), []}.
  %% Folds F over every window of three consecutive characters in the list,
  %% left to right. Lists shorter than three characters leave the
  %% accumulator untouched.
  scan_trigrams([C1, C2, C3 | Rest], F, Acc) ->
      scan_trigrams([C2, C3 | Rest], F, F([C1, C2, C3], Acc));
  scan_trigrams(_Short, _F, Acc) ->
      Acc.
  %% Access routines:
  %%   open()                 -> Table
  %%   close(Table)
  %%   is_word(Table, String) -> Bool
  %%
  %% is_word/2 pads the string with one space on each side and returns true
  %% only if every trigram of the padded string is present in the table.
  is_word(Tab, Str) ->
      Padded = [$\s | Str] ++ [$\s],
      is_word1(Tab, Padded).
  %% Checks each three-character window of the list against the table,
  %% stopping at the first window that is not a known trigram.
  %% Lists shorter than three characters are never words.
  is_word1(Tab, [_, _, _] = Last) ->
      is_this_a_trigram(Tab, Last);
  is_word1(Tab, [C1, C2, C3 | More]) ->
      is_this_a_trigram(Tab, [C1, C2, C3])
          andalso is_word1(Tab, [C2, C3 | More]);
  is_word1(_Tab, _TooShort) ->
      false.
  %% Returns true if the character list X, converted to a binary, is a key in
  %% the ETS table Tab, and false otherwise.
  %%
  %% Uses ets:member/2 because only presence matters; this avoids copying
  %% the stored object out of the table as ets:lookup/2 would.
  is_this_a_trigram(Tab, X) ->
      ets:member(Tab, list_to_binary(X)).
  %% Loads the trigram table from "trigramsS.tab", which is expected to sit
  %% in the same directory as this module's object file, and returns the
  %% ETS table. Crashes (badmatch) if the file cannot be loaded.
  open() ->
      Dir = filename:dirname(code:which(?MODULE)),
      %% filename:join/2 instead of manual "/" concatenation.
      {ok, Tab} = ets:file2tab(filename:join(Dir, "trigramsS.tab")),
      Tab.
  %% Releases a table previously returned by open/0. Returns true.
  close(Tab) ->
      true = ets:delete(Tab).