/jaerlang-code/code/indexer-1.1/indexer_trigrams.erl

https://github.com/killme2008/erlib · Erlang · 142 lines · 91 code · 35 blank · 16 comment · 1 complexity · dbceb4204034fb346c1ee6b3fabb94ca MD5 · raw file

  1. %% ---
  2. %% Excerpted from "Programming Erlang",
  3. %% published by The Pragmatic Bookshelf.
  4. %% Copyrights apply to this code. It may not be used to create training material,
  5. %% courses, books, articles, and the like. Contact us if you are in doubt.
  6. %% We make no guarantees that this code is fit for any purpose.
  7. %% Visit http://www.pragmaticprogrammer.com/titles/jaerlang for more book information.
  8. %%---
  9. -module(indexer_trigrams).
  10. -export([for_each_trigram_in_the_english_language/2,
  11. make_tables/0, timer_tests/0,
  12. open/0, close/1, is_word/2,
  13. howManyTrigrams/0,
  14. make_dict/0, make_ordered_set/0, makeSet/0,
  15. lookup_all_ets/2, lookup_all_dict/2
  16. ]).
  17. -import(lists, [reverse/1]).
  %% Builds the default (set-based) trigram table on disk.
  %% Prints a progress notice first, then delegates to makeSet/0 and
  %% returns its result.
  make_tables() ->
      io:format("Building trigrams -- may take some time~n"),
      makeSet().
  %% Builds the trigram table as an ETS ordered_set and stores it in
  %% "trigramsOS.tab". Returns whatever makeAset/2 returns.
  make_ordered_set() ->
      makeAset(ordered_set, "trigramsOS.tab").
  %% Builds the trigram table as an ETS set and stores it in
  %% "trigramsS.tab". Returns whatever makeAset/2 returns.
  makeSet() ->
      makeAset(set, "trigramsS.tab").
  %% Builds an ETS table of the given Type holding one {TrigramBinary}
  %% tuple per trigram key, dumps it to FileName and returns the number of
  %% entries in the table.
  %% Crashes with a badmatch when the table cannot be written, instead of
  %% reporting a size for a file that was never produced.
  makeAset(Type, FileName) ->
      Tab = ets:new(table, [Type]),
      try
          %% The fold accumulator is not used: the table collects the results.
          F = fun(Str, _) -> ets:insert(Tab, {list_to_binary(Str)}) end,
          for_each_trigram_in_the_english_language(F, 0),
          ok = ets:tab2file(Tab, FileName),
          ets:info(Tab, size)
      after
          %% Release the table even when the scan or the dump fails.
          ets:delete(Tab)
      end.
  %% Collects every trigram as a dict key (with [] as its value) and writes
  %% the dict, serialised with term_to_binary/1, to "trigrams.dict".
  %% Returns the result of file:write_file/2.
  make_dict() ->
      AddKey = fun(Trigram, Acc) ->
                       dict:store(list_to_binary(Trigram), [], Acc)
               end,
      Filled = for_each_trigram_in_the_english_language(AddKey, dict:new()),
      Payload = term_to_binary(Filled),
      file:write_file("trigrams.dict", [Payload]).
  %% Runs the lookup benchmarks in order: the ordered_set table, the set
  %% table, then the dict. Returns the result of time_lookup_dict/0.
  timer_tests() ->
      Tables = [{"Ordered Set", "trigramsOS.tab"},
                {"Set", "trigramsS.tab"}],
      lists:foreach(fun({Label, File}) -> time_lookup_set(Label, File) end,
                    Tables),
      time_lookup_dict().
  %% Loads the ETS table stored in File, looks up every key it contains
  %% once, and prints the average time per lookup in microseconds, labelled
  %% with Type. The table is deleted afterwards.
  time_lookup_set(Type, File) ->
      {ok, Table} = ets:file2tab(File),
      Entries = ets:tab2list(Table),
      {Micros, _Result} = timer:tc(?MODULE, lookup_all_ets, [Table, Entries]),
      PerLookup = Micros / length(Entries),
      io:format("~s lookup=~p micro seconds~n", [Type, PerLookup]),
      ets:delete(Table).
  %% Looks up the key of every {Key} tuple in L in the ETS table Tab.
  %% The lookup results are discarded; returns ok.
  lookup_all_ets(Tab, L) ->
      Probe = fun({Key}) -> ets:lookup(Tab, Key) end,
      lists:foreach(Probe, L).
  %% Reads the dict saved in "trigrams.dict", looks up every one of its keys
  %% once, and prints the average time per lookup in microseconds.
  time_lookup_dict() ->
      {ok, Serialized} = file:read_file("trigrams.dict"),
      Dict = binary_to_term(Serialized),
      Keys = lists:map(fun({Key, _Value}) -> Key end, dict:to_list(Dict)),
      {Micros, _Result} = timer:tc(?MODULE, lookup_all_dict, [Dict, Keys]),
      PerLookup = Micros / length(Keys),
      io:format("Dict lookup=~p micro seconds~n", [PerLookup]).
  %% Calls dict:find/2 on Dict for every key in L.
  %% The results are discarded; returns ok.
  lookup_all_dict(Dict, L) ->
      Find = fun(Key) -> dict:find(Key, Dict) end,
      lists:foreach(Find, L).
  %% Counts the trigrams visited by the iterator: the counter starts at 0
  %% and is incremented once per trigram passed to the fold function.
  howManyTrigrams() ->
      Count = fun(_Trigram, Seen) -> Seen + 1 end,
      for_each_trigram_in_the_english_language(Count, 0).
  %% An iterator that folds F over all trigrams in the language.
  %% Reads the gzip-compressed word list "../354984si.ngl.gz", unpacks it
  %% and threads the accumulator A0 through F(Trigram, Acc) for every
  %% trigram produced by scan_word_list/3. Returns the final accumulator.
  for_each_trigram_in_the_english_language(F, A0) ->
      {ok, Compressed} = file:read_file("../354984si.ngl.gz"),
      Chars = binary_to_list(zlib:gunzip(Compressed)),
      scan_word_list(Chars, F, A0).
  %% Walks the character list one word at a time. Each word returned by
  %% get_next_word/2 is prefixed with a space and its trigrams are folded
  %% into the accumulator with F. Returns the accumulator once the input
  %% is exhausted.
  scan_word_list([], _F, Acc) ->
      Acc;
  scan_word_list(Chars, F, Acc) ->
      {Word, Rest} = get_next_word(Chars, []),
      scan_word_list(Rest, F, scan_trigrams([$\s | Word], F, Acc)).
  %% Scans for the end of the current word.
  %% The second argument accumulates the word's characters in reverse, so
  %% it has to be reversed when a line terminator is found or the input
  %% runs out.
  %% Both "\r\n" and a bare "\n" end a word, so word lists saved with Unix
  %% line endings are split into words as well.
  %% Returns {Word, Rest}, where Word carries a trailing space.
  get_next_word([$\r, $\n | T], L) -> {lists:reverse([$\s | L]), T};
  get_next_word([$\n | T], L) -> {lists:reverse([$\s | L]), T};
  get_next_word([H | T], L) -> get_next_word(T, [H | L]);
  get_next_word([], L) -> {lists:reverse([$\s | L]), []}.
  %% Slides a three-character window over the list and folds F over each
  %% window: F(Trigram, Acc) -> NewAcc. Lists shorter than three characters
  %% contribute nothing. Returns the final accumulator.
  scan_trigrams([C1 | [C2, C3 | _] = Next], F, Acc) ->
      scan_trigrams(Next, F, F([C1, C2, C3], Acc));
  scan_trigrams(_Short, _F, Acc) ->
      Acc.
  %% access routines
  %% open() -> Table
  %% close(Table)
  %% is_word(Table, String) -> Bool
  %%
  %% The string is padded with one space on each side before its trigrams
  %% are checked against the table by is_word1/2.
  is_word(Tab, Str) ->
      Padded = [$\s | Str] ++ [$\s],
      is_word1(Tab, Padded).
  %% True when every consecutive three-character window of the list is a
  %% known trigram; checking stops at the first miss. Lists shorter than
  %% three characters are rejected.
  is_word1(Tab, [_, _, _] = Last) ->
      is_this_a_trigram(Tab, Last);
  is_word1(Tab, [C1 | [C2, C3 | _] = Next]) ->
      is_this_a_trigram(Tab, [C1, C2, C3]) andalso is_word1(Tab, Next);
  is_word1(_Tab, _TooShort) ->
      false.
  %% Reports whether the character list X, converted to a binary, is a key
  %% in the ETS table Tab. Returns true | false.
  %% ets:member/2 answers the existence question directly, without copying
  %% the matching objects out of the table as ets:lookup/2 would.
  is_this_a_trigram(Tab, X) ->
      ets:member(Tab, list_to_binary(X)).
  %% Loads the set-based trigram table "trigramsS.tab" from the directory
  %% reported by code:which/1 for this module and returns the table
  %% identifier. Crashes with a badmatch if the file cannot be loaded.
  open() ->
      ModuleDir = filename:dirname(code:which(?MODULE)),
      {ok, Tab} = ets:file2tab(ModuleDir ++ "/trigramsS.tab"),
      Tab.
  %% Deletes the trigram table Tab (as returned by open/0).
  %% Returns the result of ets:delete/1.
  close(Tab) ->
      ets:delete(Tab).