PageRenderTime 36ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/apprecommender/experiments/pure.py

https://gitlab.com/lucasmoura128/AppRecommender
Python | 223 lines | 209 code | 5 blank | 9 comment | 22 complexity | 999b2c81cb5245391071a9d88426227c MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. profile-suite - experiment different profile sizes
  4. """
  5. __author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6. __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7. __license__ = """
  8. This program is free software: you can redistribute it and/or modify
  9. it under the terms of the GNU General Public License as published by
  10. the Free Software Foundation, either version 3 of the License, or
  11. (at your option) any later version.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. """
  19. import os
  20. import sys
  21. import random
  22. import Gnuplot
  23. import numpy
  24. sys.path.insert(0, '../')
  25. from config import Config
  26. from evaluation import Evaluation, Precision, F_score
  27. from recommender import Recommender, RecommendationResult
  28. from user import User, PopconSystem
  29. if __name__ == '__main__':
  30. if len(sys.argv) < 2:
  31. print "Usage: pure strategy_category sample_file"
  32. exit(1)
  33. iterations = 20
  34. profile_size = [10, 20, 40, 60, 80, 100, 140, 170, 200, 240]
  35. neighbor_size = [3, 5, 10, 20, 30, 50, 70, 100, 150, 200]
  36. content_strategies = [
  37. 'cb', 'cbt', 'cbd', 'cbh', 'cb_eset', 'cbt_eset',
  38. 'cbd_eset', 'cbh_eset']
  39. collaborative_strategies = ['knn_eset', 'knn', 'knn_plus']
  40. # iterations = 1
  41. # profile_size = [10,20,30]
  42. # neighbor_size = [3,5,10,20,30,50]
  43. # content_strategies = ['cb']
  44. # collaborative_strategies = ['knn']
  45. strategy_category = sys.argv[1]
  46. if strategy_category == "content":
  47. strategies = content_strategies
  48. sizes = profile_size
  49. option_str = "profile"
  50. elif strategy_category == "collaborative":
  51. strategies = collaborative_strategies
  52. sizes = neighbor_size
  53. option_str = "neighborhood"
  54. else:
  55. print "Usage: profile-suite strategy_category sample_file"
  56. exit(1)
  57. cfg = Config()
  58. population_sample = []
  59. sample_file = sys.argv[2]
  60. sample_str = sample_file.split('/')[-1]
  61. with open(sample_file, 'r') as f:
  62. for line in f.readlines():
  63. user_id = line.strip('\n')
  64. population_sample.append(
  65. os.path.join(cfg.popcon_dir, user_id[:2], user_id))
  66. sample_dir = ("results/%s/%s" %
  67. (strategy_category, sample_str))
  68. if not os.path.exists(sample_dir):
  69. os.makedirs(sample_dir)
  70. for strategy in strategies:
  71. cfg.strategy = strategy
  72. p_10_summary = {}
  73. f05_100_summary = {}
  74. c_10 = {}
  75. c_100 = {}
  76. log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)
  77. graph_10 = log_file + "-10.png"
  78. graph_100 = log_file + "-100.png"
  79. graph_10_jpg = graph_10.strip(".png") + ".jpg"
  80. graph_100_jpg = graph_100.strip(".png") + ".jpg"
  81. comment_10 = graph_10_jpg + ".comment"
  82. comment_100 = graph_100_jpg + ".comment"
  83. with open(comment_10, 'w') as f:
  84. f.write("# sample %s\n" % sample_str)
  85. f.write("# strategy %s\n# threshold 10\n# iterations %d\n\n" %
  86. (cfg.strategy, iterations))
  87. f.write("# %s\tmean_p_10\tdev_p_10\tc_10\n\n" % option_str)
  88. with open(comment_100, 'w') as f:
  89. f.write("# sample %s\n" % sample_str)
  90. f.write("# strategy %s\n# threshold 100\n# iterations %d\n\n" %
  91. (cfg.strategy, iterations))
  92. f.write("# %s\t\tmean_f05_100\t\tdev_f05_100\t\tc_100\n\n" %
  93. option_str)
  94. for size in sizes:
  95. c_10[size] = set()
  96. c_100[size] = set()
  97. p_10_summary[size] = []
  98. f05_100_summary[size] = []
  99. with open(log_file + "-%s%.3d" % (option_str, size), 'w') as f:
  100. f.write("# sample %s\n" % sample_str)
  101. f.write("# strategy %s-%s%.3d\n\n" %
  102. (cfg.strategy, option_str, size))
  103. f.write("# p_10\tf05_100\n\n")
  104. # main loop per user
  105. for submission_file in population_sample:
  106. user = PopconSystem(submission_file)
  107. user.filter_pkg_profile(cfg.pkgs_filter)
  108. user.maximal_pkg_profile()
  109. for size in sizes:
  110. cfg.profile_size = size
  111. cfg.k_neighbors = size
  112. rec = Recommender(cfg)
  113. repo_size = rec.items_repository.get_doccount()
  114. p_10 = []
  115. f05_100 = []
  116. for n in range(iterations):
  117. # Fill sample profile
  118. profile_len = len(user.pkg_profile)
  119. item_score = {}
  120. for pkg in user.pkg_profile:
  121. item_score[pkg] = user.item_score[pkg]
  122. sample = {}
  123. sample_size = int(profile_len * 0.9)
  124. for i in range(sample_size):
  125. key = random.choice(item_score.keys())
  126. sample[key] = item_score.pop(key)
  127. iteration_user = User(item_score)
  128. recommendation = rec.get_recommendation(
  129. iteration_user, repo_size)
  130. if hasattr(recommendation, "ranking"):
  131. ranking = recommendation.ranking
  132. real = RecommendationResult(sample)
  133. predicted_10 = RecommendationResult(
  134. dict.fromkeys(ranking[:10], 1))
  135. evaluation = Evaluation(predicted_10, real, repo_size)
  136. p_10.append(evaluation.run(Precision()))
  137. predicted_100 = RecommendationResult(
  138. dict.fromkeys(ranking[:100], 1))
  139. evaluation = Evaluation(predicted_100, real, repo_size)
  140. f05_100.append(evaluation.run(F_score(0.5)))
  141. c_10[size] = c_10[size].union(
  142. recommendation.ranking[:10])
  143. c_100[size] = c_100[size].union(
  144. recommendation.ranking[:100])
  145. # save summary
  146. if p_10:
  147. p_10_summary[size].append(numpy.mean(p_10))
  148. if f05_100:
  149. f05_100_summary[size].append(numpy.mean(f05_100))
  150. with open(log_file + "-%s%.3d" % (option_str, size), 'a') as f:
  151. f.write("%.4f \t%.4f\n" %
  152. (numpy.mean(p_10), numpy.mean(f05_100)))
  153. # back to main flow
  154. coverage_10 = {}
  155. coverage_100 = {}
  156. with open(comment_10, 'a') as f:
  157. for size in sizes:
  158. coverage_10[size] = len(c_10[size]) / float(repo_size)
  159. f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
  160. (size, numpy.mean(p_10_summary[size]),
  161. numpy.std(p_10_summary[size]), coverage_10[size]))
  162. with open(comment_100, 'a') as f:
  163. for size in sizes:
  164. coverage_100[size] = len(c_100[size]) / float(repo_size)
  165. f.write("%3d\t\t%.4f\t\t%.4f\t\t%.4f\n" %
  166. (size, numpy.mean(f05_100_summary[size]),
  167. numpy.std(f05_100_summary[size]), coverage_100[size]))
  168. # plot results summary
  169. g = Gnuplot.Gnuplot()
  170. g('set style data lines')
  171. g('set yrange [0:1.0]')
  172. g.xlabel('%s size' % option_str.capitalize())
  173. g.title("Setup: %s (threshold 10)" % cfg.strategy)
  174. g.plot(Gnuplot.Data(sorted([[k, numpy.mean(p_10_summary[k]),
  175. numpy.std(p_10_summary[k])]
  176. for k in p_10_summary.keys(
  177. )]), title="Precision"),
  178. Gnuplot.Data(sorted([[k, numpy.mean(p_10_summary[k]),
  179. numpy.std(p_10_summary[k])]
  180. for k in p_10_summary.keys(
  181. )]), title="Deviation",
  182. with_="yerrorbar lt 2 pt 6"), Gnuplot.Data(
  183. sorted(
  184. [[k, coverage_10[k]]
  185. for k in coverage_10.keys()]), title="Coverage"))
  186. g.hardcopy(graph_10, terminal="png")
  187. g = Gnuplot.Gnuplot()
  188. g('set style data lines')
  189. g('set yrange [0:1.0]')
  190. g.xlabel('%s size' % option_str.capitalize())
  191. g.title("Setup: %s (threshold 100)" % cfg.strategy)
  192. g.plot(Gnuplot.Data(sorted([[k, numpy.mean(f05_100_summary[k]),
  193. numpy.std(f05_100_summary[k])]
  194. for k in f05_100_summary.keys(
  195. )]), title="F05"),
  196. Gnuplot.Data(sorted([[k, numpy.mean(f05_100_summary[k]),
  197. numpy.std(f05_100_summary[k])]
  198. for k in f05_100_summary.keys(
  199. )]), title="Deviation",
  200. with_="yerrorbar lt 2 pt 6"), Gnuplot.Data(
  201. sorted(
  202. [[k, coverage_100[k]]
  203. for k in coverage_100.keys()]), title="Coverage"))
  204. g.hardcopy(graph_100, terminal="png")