PageRenderTime 71ms CodeModel.GetById 0ms RepoModel.GetById 1ms app.codeStats 0ms

/apprecommender/experiments/hybrid.py

https://gitlab.com/GCS2016/AppRecommender
Python | 227 lines | 211 code | 7 blank | 9 comment | 25 complexity | 2dc03c1938cb94951fc232a56c472280 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. hybrid-suite
  4. """
  5. __author__ = "Tassia Camoes Araujo <tassia@gmail.com>"
  6. __copyright__ = "Copyright (C) 2011 Tassia Camoes Araujo"
  7. __license__ = """
  8. This program is free software: you can redistribute it and/or modify
  9. it under the terms of the GNU General Public License as published by
  10. the Free Software Foundation, either version 3 of the License, or
  11. (at your option) any later version.
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. """
  19. import Gnuplot
  20. import numpy
  21. import os
  22. import random
  23. import sys
  24. sys.path.insert(0, '../')
  25. from config import Config
  26. from recommender import Recommender, RecommendationResult
  27. from evaluation import Evaluation, Precision, F_score
  28. from user import User, PopconSystem
  29. # hybrid_strategies = ['knnco','knnco_eset']
  # Experiment grid: every (neighborhood size, profile size) pair is evaluated
  # ITERATIONS times per user.  For a quick smoke run use ITERATIONS = 1 and
  # [10, 20, 30] for both size lists.
  ITERATIONS = 20
  PROFILE_SIZES = [10, 40, 70, 100, 170, 240]
  NEIGHBOR_SIZES = [3, 10, 50, 70, 100, 150, 200]
  # Fraction of each user's profile that is withheld from the recommender and
  # used as the reference the recommendation is scored against.
  HOLDOUT_FRACTION = 0.9


  def _read_population_sample(sample_file, popcon_dir):
      """Return one popcon submission path per user id listed in sample_file.

      Every non-blank line of ``sample_file`` is taken as a user id; its path
      is ``popcon_dir/<first two characters of the id>/<id>``.
      """
      submissions = []
      with open(sample_file, 'r') as f:
          for line in f:
              user_id = line.strip()
              # A blank line would otherwise yield a path to popcon_dir itself.
              if not user_id:
                  continue
              submissions.append(
                  os.path.join(popcon_dir, user_id[:2], user_id))
      return submissions


  def _profile_log_path(log_file, k, size):
      """Return the per-(neighborhood, profile size) raw results file path."""
      return log_file + "-neighborhood%.3d-profile%.3d" % (k, size)


  def _comment_path(graph_png):
      """Return the ``<graph>.jpg.comment`` path that accompanies a PNG path."""
      # splitext drops exactly the ".png" suffix; str.strip(".png") would
      # instead strip any of the characters '.', 'p', 'n', 'g' from both ends.
      return os.path.splitext(graph_png)[0] + ".jpg" + ".comment"


  def _write_comment_header(path, sample_str, strategy, threshold, iterations,
                            columns):
      """Create the summary file at ``path`` and write its comment header."""
      with open(path, 'w') as f:
          f.write("# %s\n" % sample_str)
          f.write("# strategy %s\n# threshold %d\n# iterations %d\n\n" %
                  (strategy, threshold, iterations))
          f.write(columns)


  def _split_profile(user):
      """Randomly split the user's profile into (kept, held_out) score dicts.

      ``held_out`` receives HOLDOUT_FRACTION of the profile's packages, drawn
      uniformly without replacement; ``kept`` holds the remaining ones.
      """
      kept = {pkg: user.item_score[pkg] for pkg in user.pkg_profile}
      # Clamp so that duplicate entries in pkg_profile cannot request more
      # keys than the dict actually holds.
      sample_size = min(int(len(user.pkg_profile) * HOLDOUT_FRACTION),
                        len(kept))
      # One random.sample call replaces repeated random.choice(d.keys()),
      # which rebuilt the key list on every draw.
      held_out = {key: kept.pop(key)
                  for key in random.sample(list(kept), sample_size)}
      return kept, held_out


  def _evaluate_user(rec, user, repo_size, iterations, seen_10, seen_100):
      """Run ``iterations`` random hold-out rounds for one user.

      Returns ``(p_10, f05_100)``: the precision of the top 10 and the F0.5
      score of the top 100 recommended items for every round in which the
      recommendation had a ``ranking`` attribute.  The top-10 and top-100
      items of those rounds are added to ``seen_10`` and ``seen_100``.
      """
      p_10 = []
      f05_100 = []
      for _ in range(iterations):
          kept, held_out = _split_profile(user)
          recommendation = rec.get_recommendation(User(kept), repo_size)
          if not hasattr(recommendation, "ranking"):
              continue
          ranking = recommendation.ranking
          real = RecommendationResult(held_out)

          predicted_10 = RecommendationResult(dict.fromkeys(ranking[:10], 1))
          evaluation = Evaluation(predicted_10, real, repo_size)
          p_10.append(evaluation.run(Precision()))

          predicted_100 = RecommendationResult(
              dict.fromkeys(ranking[:100], 1))
          evaluation = Evaluation(predicted_100, real, repo_size)
          f05_100.append(evaluation.run(F_score(0.5)))

          seen_10.update(ranking[:10])
          seen_100.update(ranking[:100])
      return p_10, f05_100


  def _append_summary(path, k, summary, recommended, repo_size):
      """Append one summary row per profile size to ``path``.

      Each row holds the neighborhood size, the profile size, the mean and
      standard deviation of ``summary[size]`` and the coverage, i.e. the
      number of distinct items in ``recommended[size]`` divided by
      ``repo_size``.  Returns the coverage values keyed by profile size.
      """
      coverage = {}
      with open(path, 'a') as f:
          for size in PROFILE_SIZES:
              coverage[size] = len(recommended[size]) / float(repo_size)
              f.write("%3d\t\t%3d\t\t%.4f\t%.4f\t%.4f\n" %
                      (k, size, numpy.mean(summary[size]),
                       numpy.std(summary[size]), coverage[size]))
      return coverage


  def _plot_summary(title, metric_label, summary, coverage, graph_png):
      """Plot metric mean, deviation bars and coverage against profile size."""
      rows = sorted([size, numpy.mean(values), numpy.std(values)]
                    for size, values in summary.items())
      coverage_rows = sorted([size, value]
                             for size, value in coverage.items())
      g = Gnuplot.Gnuplot()
      g('set style data lines')
      g('set yrange [0:1.0]')
      g.xlabel('Profile size')
      g.title(title)
      g.plot(Gnuplot.Data(rows, title=metric_label),
             Gnuplot.Data(rows, title="Deviation",
                          with_="yerrorbar lt 2 pt 6"),
             Gnuplot.Data(coverage_rows, title="Coverage"))
      g.hardcopy(graph_png, terminal="png")


  def main(argv):
      """Run the hybrid experiment: ``argv[1]`` is the recommendation
      strategy, ``argv[2]`` a file listing one popcon user id per line.

      Raw per-user results, per-neighborhood summary tables and PNG plots are
      written under ``results/hybrid/<sample file name>/<strategy>/``.
      Exits with status 1 when arguments are missing or the sample is empty.
      """
      # Both the strategy and the sample file are required (argv[1], argv[2]).
      if len(argv) < 3:
          print("Usage: hybrid strategy sample_file")
          sys.exit(1)
      strategy = argv[1]
      sample_file = argv[2]
      sample_str = sample_file.split('/')[-1]

      cfg = Config()
      population_sample = _read_population_sample(sample_file, cfg.popcon_dir)
      if not population_sample:
          # Without users there is no repository size to compute coverage
          # from, so stop before creating any output.
          print("No user ids found in %s" % sample_file)
          sys.exit(1)

      sample_dir = "results/hybrid/%s/%s" % (sample_str, strategy)
      if not os.path.exists(sample_dir):
          os.makedirs(sample_dir)
      cfg.strategy = strategy
      log_file = os.path.join(sample_dir, sample_str + "-" + cfg.strategy)

      # Everything below is keyed by neighborhood size k, and the result
      # containers additionally by profile size.
      graph_10 = {}
      graph_100 = {}
      comment_10 = {}
      comment_100 = {}
      c_10 = {}
      c_100 = {}
      p_10_summary = {}
      f05_100_summary = {}
      for k in NEIGHBOR_SIZES:
          graph_10[k] = log_file + ("-neighborhood%.3d-010.png" % k)
          graph_100[k] = log_file + ("-neighborhood%.3d-100.png" % k)
          comment_10[k] = _comment_path(graph_10[k])
          comment_100[k] = _comment_path(graph_100[k])
          _write_comment_header(
              comment_10[k], sample_str, cfg.strategy, 10, ITERATIONS,
              "# neighborhood\tprofile\tmean_p_10\tdev_p_10\tc_10\n\n")
          _write_comment_header(
              comment_100[k], sample_str, cfg.strategy, 100, ITERATIONS,
              '# neighborhood\tprofile\tmean_f05_100\tdev_f05_100\tc_100\n\n')
          c_10[k] = {}
          c_100[k] = {}
          p_10_summary[k] = {}
          f05_100_summary[k] = {}
          for size in PROFILE_SIZES:
              c_10[k][size] = set()
              c_100[k][size] = set()
              p_10_summary[k][size] = []
              f05_100_summary[k][size] = []
              with open(_profile_log_path(log_file, k, size), 'w') as f:
                  f.write("# %s\n" % sample_str)
                  f.write("# strategy %s-neighborhood%.3d-profile%.3d\n\n" %
                          (cfg.strategy, k, size))
                  f.write("# p_10\t\tf05_100\n\n")

      # Main loop per user.  population_sample and both size lists are
      # non-empty here, so repo_size is always bound when the loop finishes.
      for submission_file in population_sample:
          user = PopconSystem(submission_file)
          user.filter_pkg_profile(cfg.pkgs_filter)
          user.maximal_pkg_profile()
          for k in NEIGHBOR_SIZES:
              cfg.k_neighbors = k
              for size in PROFILE_SIZES:
                  cfg.profile_size = size
                  rec = Recommender(cfg)
                  repo_size = rec.items_repository.get_doccount()
                  p_10, f05_100 = _evaluate_user(
                      rec, user, repo_size, ITERATIONS,
                      c_10[k][size], c_100[k][size])
                  # Save summary; rounds without a ranking contribute nothing.
                  if p_10:
                      p_10_summary[k][size].append(numpy.mean(p_10))
                  if f05_100:
                      f05_100_summary[k][size].append(numpy.mean(f05_100))
                  with open(_profile_log_path(log_file, k, size), 'a') as f:
                      f.write("%.4f\t\t%.4f\n" %
                              (numpy.mean(p_10), numpy.mean(f05_100)))

      # Back to main flow: summary tables, then one pair of plots per k.
      coverage_10 = {}
      coverage_100 = {}
      for k in NEIGHBOR_SIZES:
          coverage_10[k] = _append_summary(
              comment_10[k], k, p_10_summary[k], c_10[k], repo_size)
          coverage_100[k] = _append_summary(
              comment_100[k], k, f05_100_summary[k], c_100[k], repo_size)

      for k in NEIGHBOR_SIZES:
          _plot_summary(
              "Setup: %s-neighborhood%3d (threshold 10)" % (cfg.strategy, k),
              "Precision", p_10_summary[k], coverage_10[k], graph_10[k])
          _plot_summary(
              "Setup: %s-neighborhood%3d (threshold 100)" %
              (cfg.strategy, k),
              "F05", f05_100_summary[k], coverage_100[k], graph_100[k])


  if __name__ == '__main__':
      main(sys.argv)