PageRenderTime 30ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/python-debian-0.1.21/lib/debian/debtags.py

#
Python | 505 lines | 426 code | 24 blank | 55 comment | 21 complexity | bbce07f91532e88c5daedc0c992f3128 MD5 | raw file
  1. # debtags.py -- Access and manipulate Debtags information
  2. # Copyright (C) 2006-2007 Enrico Zini <enrico@enricozini.org>
  3. #
  4. # This program is free software: you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation, either version 3 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful, but
  10. # WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. # General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. import re, cPickle
  17. from deprecation import function_deprecated_by
  18. def parse_tags(input):
  19. lre = re.compile(r"^(.+?)(?::?\s*|:\s+(.+?)\s*)$")
  20. for line in input:
  21. # Is there a way to remove the last character of a line that does not
  22. # make a copy of the entire line?
  23. m = lre.match(line)
  24. pkgs = set(m.group(1).split(', '))
  25. if m.group(2):
  26. tags = set(m.group(2).split(', '))
  27. else:
  28. tags = set()
  29. yield pkgs, tags
  30. parseTags = function_deprecated_by(parse_tags)
  31. def read_tag_database(input):
  32. "Read the tag database, returning a pkg->tags dictionary"
  33. db = {}
  34. for pkgs, tags in parse_tags(input):
  35. # Create the tag set using the native set
  36. for p in pkgs:
  37. db[p] = tags.copy()
  38. return db;
  39. readTagDatabase = function_deprecated_by(read_tag_database)
  40. def read_tag_database_reversed(input):
  41. "Read the tag database, returning a tag->pkgs dictionary"
  42. db = {}
  43. for pkgs, tags in parse_tags(input):
  44. # Create the tag set using the native set
  45. for tag in tags:
  46. if db.has_key(tag):
  47. db[tag] |= pkgs
  48. else:
  49. db[tag] = pkgs.copy()
  50. return db;
  51. readTagDatabaseReversed = function_deprecated_by(read_tag_database_reversed)
  52. def read_tag_database_both_ways(input, tag_filter = None):
  53. "Read the tag database, returning a pkg->tags and a tag->pkgs dictionary"
  54. db = {}
  55. dbr = {}
  56. for pkgs, tags in parse_tags(input):
  57. # Create the tag set using the native set
  58. if tag_filter == None:
  59. tags = set(tags)
  60. else:
  61. tags = set(filter(tag_filter, tags))
  62. for pkg in pkgs:
  63. db[pkg] = tags.copy()
  64. for tag in tags:
  65. if dbr.has_key(tag):
  66. dbr[tag] |= pkgs
  67. else:
  68. dbr[tag] = pkgs.copy()
  69. return db, dbr;
  70. readTagDatabaseBothWays = function_deprecated_by(read_tag_database_both_ways)
  71. def reverse(db):
  72. "Reverse a tag database, from package -> tags to tag->packages"
  73. res = {}
  74. for pkg, tags in db.items():
  75. for tag in tags:
  76. if not res.has_key(tag):
  77. res[tag] = set()
  78. res[tag].add(pkg)
  79. return res
  80. def output(db):
  81. "Write the tag database"
  82. for pkg, tags in db.items():
  83. # Using % here seems awkward to me, but if I use calls to
  84. # sys.stdout.write it becomes a bit slower
  85. print "%s:" % (pkg), ", ".join(tags)
  86. def relevance_index_function(full, sub):
  87. #return (float(sub.card(tag)) / float(sub.tag_count())) / \
  88. # (float(full.card(tag)) / float(full.tag_count()))
  89. #return sub.card(tag) * full.card(tag) / sub.tag_count()
  90. # New cardinality divided by the old cardinality
  91. #return float(sub.card(tag)) / float(full.card(tag))
  92. ## Same as before, but weighted by the relevance the tag had in the
  93. ## full collection, to downplay the importance of rare tags
  94. #return float(sub.card(tag) * full.card(tag)) / float(full.card(tag) * full.tag_count())
  95. # Simplified version:
  96. #return float(sub.card(tag)) / float(full.tag_count())
  97. # Weighted by the square root of the relevance, to downplay the very
  98. # common tags a bit
  99. #return lambda tag: float(sub.card(tag)) / float(full.card(tag)) * math.sqrt(full.card(tag) / float(full.tag_count()))
  100. #return lambda tag: float(sub.card(tag)) / float(full.card(tag)) * math.sqrt(full.card(tag) / float(full.package_count()))
  101. # One useless factor removed, and simplified further, thanks to Benjamin Mesing
  102. return lambda tag: float(sub.card(tag)**2) / float(full.card(tag))
  103. # The difference between how many packages are in and how many packages are out
  104. # (problems: tags that mean many different things can be very much out
  105. # as well. In the case of 'image editor', for example, there will be
  106. # lots of editors not for images in the outside group.
  107. # It is very, very good for nonambiguous keywords like 'image'.
  108. #return lambda tag: 2 * sub.card(tag) - full.card(tag)
  109. # Same but it tries to downplay the 'how many are out' value in the
  110. # case of popular tags, to mitigate the 'there will always be popular
  111. # tags left out' cases. Does not seem to be much of an improvement.
  112. #return lambda tag: sub.card(tag) - float(full.card(tag) - sub.card(tag))/(math.sin(float(full.card(tag))*3.1415/full.package_count())/4 + 0.75)
  113. relevanceIndexFunction = function_deprecated_by(relevance_index_function)
  114. class DB:
  115. """
  116. In-memory database mapping packages to tags and tags to packages.
  117. """
  118. def __init__(self):
  119. self.db = {}
  120. self.rdb = {}
  121. def read(self, input, tag_filter=None):
  122. """
  123. Read the database from a file.
  124. Example::
  125. # Read the system Debtags database
  126. db.read(open("/var/lib/debtags/package-tags", "r"))
  127. """
  128. self.db, self.rdb = read_tag_database_both_ways(input, tag_filter)
  129. def qwrite(self, file):
  130. "Quickly write the data to a pickled file"
  131. cPickle.dump(self.db, file)
  132. cPickle.dump(self.rdb, file)
  133. def qread(self, file):
  134. "Quickly read the data from a pickled file"
  135. self.db = cPickle.load(file)
  136. self.rdb = cPickle.load(file)
  137. def insert(self, pkg, tags):
  138. self.db[pkg] = tags.copy()
  139. for tag in tags:
  140. if self.rdb.has_key(tag):
  141. self.rdb[tag].add(pkg)
  142. else:
  143. self.rdb[tag] = set((pkg))
  144. def dump(self):
  145. output(self.db)
  146. def dump_reverse(self):
  147. output(self.rdb)
  148. dumpReverse = function_deprecated_by(dump_reverse)
  149. def reverse(self):
  150. "Return the reverse collection, sharing tagsets with this one"
  151. res = DB()
  152. res.db = self.rdb
  153. res.rdb = self.db
  154. return res
  155. def facet_collection(self):
  156. """
  157. Return a copy of this collection, but replaces the tag names
  158. with only their facets.
  159. """
  160. fcoll = DB()
  161. tofacet = re.compile(r"^([^:]+).+")
  162. for pkg, tags in self.iter_packagesTags():
  163. ftags = set([tofacet.sub(r"\1", t) for t in tags])
  164. fcoll.insert(pkg, ftags)
  165. return fcoll
  166. facetCollection = function_deprecated_by(facet_collection)
  167. def copy(self):
  168. """
  169. Return a copy of this collection, with the tagsets copied as
  170. well.
  171. """
  172. res = DB()
  173. res.db = self.db.copy()
  174. res.rdb = self.rdb.copy()
  175. return res
  176. def reverse_copy(self):
  177. """
  178. Return the reverse collection, with a copy of the tagsets of
  179. this one.
  180. """
  181. res = DB()
  182. res.db = self.rdb.copy()
  183. res.rdb = self.db.copy()
  184. return res
  185. reverseCopy = function_deprecated_by(reverse_copy)
  186. def choose_packages(self, package_iter):
  187. """
  188. Return a collection with only the packages in package_iter,
  189. sharing tagsets with this one
  190. """
  191. res = DB()
  192. db = {}
  193. for pkg in package_iter:
  194. if self.db.has_key(pkg): db[pkg] = self.db[pkg]
  195. res.db = db
  196. res.rdb = reverse(db)
  197. return res
  198. choosePackages = function_deprecated_by(choose_packages)
  199. def choose_packages_copy(self, package_iter):
  200. """
  201. Return a collection with only the packages in package_iter,
  202. with a copy of the tagsets of this one
  203. """
  204. res = DB()
  205. db = {}
  206. for pkg in package_iter:
  207. db[pkg] = self.db[pkg]
  208. res.db = db
  209. res.rdb = reverse(db)
  210. return res
  211. choosePackagesCopy = function_deprecated_by(choose_packages_copy)
  212. def filter_packages(self, package_filter):
  213. """
  214. Return a collection with only those packages that match a
  215. filter, sharing tagsets with this one. The filter will match
  216. on the package.
  217. """
  218. res = DB()
  219. db = {}
  220. for pkg in filter(package_filter, self.db.iterkeys()):
  221. db[pkg] = self.db[pkg]
  222. res.db = db
  223. res.rdb = reverse(db)
  224. return res
  225. filterPackages = function_deprecated_by(filter_packages)
  226. def filter_packages_copy(self, filter):
  227. """
  228. Return a collection with only those packages that match a
  229. filter, with a copy of the tagsets of this one. The filter
  230. will match on the package.
  231. """
  232. res = DB()
  233. db = {}
  234. for pkg in filter(filter, self.db.iterkeys()):
  235. db[pkg] = self.db[pkg].copy()
  236. res.db = db
  237. res.rdb = reverse(db)
  238. return res
  239. filterPackagesCopy = function_deprecated_by(filter_packages_copy)
  240. def filter_packages_tags(self, package_tag_filter):
  241. """
  242. Return a collection with only those packages that match a
  243. filter, sharing tagsets with this one. The filter will match
  244. on (package, tags).
  245. """
  246. res = DB()
  247. db = {}
  248. for pkg, tags in filter(package_tag_filter, self.db.iteritems()):
  249. db[pkg] = self.db[pkg]
  250. res.db = db
  251. res.rdb = reverse(db)
  252. return res
  253. filterPackagesTags = function_deprecated_by(filter_packages_tags)
  254. def filter_packages_tags_copy(self, package_tag_filter):
  255. """
  256. Return a collection with only those packages that match a
  257. filter, with a copy of the tagsets of this one. The filter
  258. will match on (package, tags).
  259. """
  260. res = DB()
  261. db = {}
  262. for pkg, tags in filter(package_tag_filter, self.db.iteritems()):
  263. db[pkg] = self.db[pkg].copy()
  264. res.db = db
  265. res.rdb = reverse(db)
  266. return res
  267. filterPackagesTagsCopy = function_deprecated_by(filter_packages_tags_copy)
  268. def filter_tags(self, tag_filter):
  269. """
  270. Return a collection with only those tags that match a
  271. filter, sharing package sets with this one. The filter will match
  272. on the tag.
  273. """
  274. res = DB()
  275. rdb = {}
  276. for tag in filter(tag_filter, self.rdb.iterkeys()):
  277. rdb[tag] = self.rdb[tag]
  278. res.rdb = rdb
  279. res.db = reverse(rdb)
  280. return res
  281. filterTags = function_deprecated_by(filter_tags)
  282. def filter_tags_copy(self, tag_filter):
  283. """
  284. Return a collection with only those tags that match a
  285. filter, with a copy of the package sets of this one. The
  286. filter will match on the tag.
  287. """
  288. res = DB()
  289. rdb = {}
  290. for tag in filter(tag_filter, self.rdb.iterkeys()):
  291. rdb[tag] = self.rdb[tag].copy()
  292. res.rdb = rdb
  293. res.db = reverse(rdb)
  294. return res
  295. filterTagsCopy = function_deprecated_by(filter_tags_copy)
  296. def has_package(self, pkg):
  297. """Check if the collection contains the given package"""
  298. return self.db.has_key(pkg)
  299. hasPackage = function_deprecated_by(has_package)
  300. def has_tag(self, tag):
  301. """Check if the collection contains packages tagged with tag"""
  302. return self.rdb.has_key(tag)
  303. hasTag = function_deprecated_by(has_tag)
  304. def tags_of_package(self, pkg):
  305. """Return the tag set of a package"""
  306. return self.db.has_key(pkg) and self.db[pkg] or set()
  307. tagsOfPackage = function_deprecated_by(tags_of_package)
  308. def packages_of_tag(self, tag):
  309. """Return the package set of a tag"""
  310. return self.rdb.has_key(tag) and self.rdb[tag] or set()
  311. packagesOfTag = function_deprecated_by(packages_of_tag)
  312. def tags_of_packages(self, pkgs):
  313. """Return the set of tags that have all the packages in pkgs"""
  314. res = None
  315. for p in pkgs:
  316. if res == None:
  317. res = set(self.tags_of_package(p))
  318. else:
  319. res &= self.tags_of_package(p)
  320. return res
  321. tagsOfPackages = function_deprecated_by(tags_of_packages)
  322. def packages_of_tags(self, tags):
  323. """Return the set of packages that have all the tags in tags"""
  324. res = None
  325. for t in tags:
  326. if res == None:
  327. res = set(self.packages_of_tag(t))
  328. else:
  329. res &= self.packages_of_tag(t)
  330. return res
  331. packagesOfTags = function_deprecated_by(packages_of_tags)
  332. def card(self, tag):
  333. """
  334. Return the cardinality of a tag
  335. """
  336. return self.rdb.has_key(tag) and len(self.rdb[tag]) or 0
  337. def discriminance(self, tag):
  338. """
  339. Return the discriminance index if the tag.
  340. Th discriminance index of the tag is defined as the minimum
  341. number of packages that would be eliminated by selecting only
  342. those tagged with this tag or only those not tagged with this
  343. tag.
  344. """
  345. n = self.card(tag)
  346. tot = self.package_count()
  347. return min(n, tot - n)
  348. def iter_packages(self):
  349. """Iterate over the packages"""
  350. return self.db.iterkeys()
  351. iterPackages = function_deprecated_by(iter_packages)
  352. def iter_tags(self):
  353. """Iterate over the tags"""
  354. return self.rdb.iterkeys()
  355. iterTags = function_deprecated_by(iter_tags)
  356. def iter_packages_tags(self):
  357. """Iterate over 2-tuples of (pkg, tags)"""
  358. return self.db.iteritems()
  359. iterPackagesTags = function_deprecated_by(iter_packages_tags)
  360. def iter_tags_packages(self):
  361. """Iterate over 2-tuples of (tag, pkgs)"""
  362. return self.rdb.iteritems()
  363. iterTagsPackages = function_deprecated_by(iter_tags_packages)
  364. def package_count(self):
  365. """Return the number of packages"""
  366. return len(self.db)
  367. packageCount = function_deprecated_by(package_count)
  368. def tag_count(self):
  369. """Return the number of tags"""
  370. return len(self.rdb)
  371. tagCount = function_deprecated_by(tag_count)
  372. def ideal_tagset(self, tags):
  373. """
  374. Return an ideal selection of the top tags in a list of tags.
  375. Return the tagset made of the highest number of tags taken in
  376. consecutive sequence from the beginning of the given vector,
  377. that would intersecate with the tagset of a comfortable amount
  378. of packages.
  379. Comfortable is defined in terms of how far it is from 7.
  380. """
  381. # TODO: the scoring function is quite ok, but may need more
  382. # tuning. I also center it on 15 instead of 7 since we're
  383. # setting a starting point for the search, not a target point
  384. def score_fun(x):
  385. return float((x-15)*(x-15))/x
  386. hits = []
  387. tagset = set()
  388. min_score = 3
  389. for i in range(len(tags)):
  390. pkgs = self.packages_of_tags(tags[:i+1])
  391. card = len(pkgs)
  392. if card == 0: break;
  393. score = score_fun(card)
  394. if score < min_score:
  395. min_score = score
  396. tagset = set(tags[:i+1])
  397. # Return always at least the first tag
  398. if len(tagset) == 0:
  399. return set(tags[:1])
  400. else:
  401. return tagset
  402. idealTagset = function_deprecated_by(ideal_tagset)
  403. def correlations(self):
  404. """
  405. Generate the list of correlation as a tuple (hastag, hasalsotag, score).
  406. Every touple will indicate that the tag 'hastag' tends to also
  407. have 'hasalsotag' with a score of 'score'.
  408. """
  409. for pivot in self.iter_tags():
  410. with_ = self.filter_packages_tags(lambda pt: pivot in pt[1])
  411. without = self.filter_packages_tags(lambda pt: pivot not in pt[1])
  412. for tag in with_.iter_tags():
  413. if tag == pivot: continue
  414. has = float(with_.card(tag)) / float(with_.package_count())
  415. hasnt = float(without.card(tag)) / float(without.package_count())
  416. yield pivot, tag, has - hasnt