PageRenderTime 44ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/fstmerge/examples/SpamBayes/rev3103-3133/spambayes/utilities/rebal.py

https://github.com/RoDaniel/featurehouse
Python | 321 lines | 277 code | 0 blank | 44 comment | 5 complexity | eee6b4414edaf35b66faa2024ade79e8 MD5 | raw file
  1. """
  2. rebal.py - rebalance a ham or spam test directory
  3. usage: rebal.py [ options ]
  4. options:
  5. -d - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
  6. -n num - specify number of files per Set dir desired [%(NPERDIR)s]
  7. -t dir - top directory, holding Set and reservoir subdirs [%(TOPDIR)s]
  8. -v - tell user what's happening; opposite of -q [%(VERBOSE)s]
  9. -q - be quiet about what's happening; opposite of -v [not %(VERBOSE)s]
  10. -c - confirm file moves into Set directory; opposite of -Q [%(CONFIRM)s]
  11. -Q - don't confirm moves; opposite of -c; independent of -v/-q
  12. -h - display this message and quit
  13. If you have a non-standard test setup, you can use -r/-s instead of -t:
  14. -r res - specify an alternate reservoir [%(RESDIR)s]
  15. -s set - specify an alternate Set prefix [%(SETPREFIX)s]
  16. Moves files randomly among the Set subdirectories and a reservoir directory to
  17. leave -n files in each Set directory. By default, the Set1, Set2, ..., and
  18. reservoir subdirectories under (relative path) Data/Ham/ are rebalanced; this
  19. can be changed with the -t option. The script will work with a variable
  20. number of Set directories, but they must already exist, and the reservoir
  21. directory must also exist.
  22. It's recommended that you run with the -d (dry run) option first, to see what
  23. the script would do without actually moving any files. If, e.g., you
  24. accidentally mix up spam Sets with your Ham reservoir, it could be very
  25. difficult to recover from that mistake.
  26. See the module comments for examples.
  27. """
  28. import os
  29. import sys
  30. import random
  31. import glob
  32. import getopt
  33. try:
  34. True, False
  35. except NameError:
  36. True, False = 1, 0
  37. NPERDIR = 4000
  38. TOPDIR = os.path.join('Data', 'Ham')
  39. RESDIR = os.path.join(TOPDIR, 'reservoir')
  40. SETPREFIX = os.path.join(TOPDIR, 'Set')
  41. VERBOSE = True
  42. CONFIRM = True
  43. DRYRUN = False
  44. def usage(msg=None):
  45. if msg:
  46. print >> sys.stderr, str(msg)
  47. print >> sys.stderr
  48. print >> sys.stderr, __doc__ % globals()
  49. def migrate(f, targetdir, verbose):
  50. """Move f into targetdir, renaming if needed to avoid name clashes.
  51. The basename of the moved file is returned; this may not be the
  52. same as the basename of f, if the file had to be renamed because
  53. a file with f's basename already existed in targetdir.
  54. """
  55. base = os.path.basename(f)
  56. out = os.path.join(targetdir, base)
  57. while os.path.exists(out):
  58. basename, ext = os.path.splitext(base)
  59. digits = random.randrange(100000000)
  60. out = os.path.join(targetdir, str(digits) + ext)
  61. if verbose:
  62. print "moving", f, "to", out
  63. os.rename(f, out)
  64. return os.path.basename(out)
  65. def main(args):
  66. nperdir = NPERDIR
  67. verbose = VERBOSE
  68. confirm = CONFIRM
  69. dryrun = DRYRUN
  70. topdir = resdir = setprefix = None
  71. try:
  72. opts, args = getopt.getopt(args, "dr:s:t:n:vqcQh")
  73. except getopt.GetoptError, msg:
  74. usage(msg)
  75. return 1
  76. for opt, arg in opts:
  77. if opt == "-n":
  78. nperdir = int(arg)
  79. elif opt == "-t":
  80. topdir = arg
  81. elif opt == "-r":
  82. resdir = arg
  83. elif opt == "-s":
  84. setprefix = arg
  85. elif opt == "-v":
  86. verbose = True
  87. elif opt == "-c":
  88. confirm = True
  89. elif opt == "-q":
  90. verbose = False
  91. elif opt == "-Q":
  92. confirm = False
  93. elif opt == "-d":
  94. dryrun = True
  95. elif opt == "-h":
  96. usage()
  97. return 0
  98. else:
  99. raise SystemError("internal error on option '%s'" % opt)
  100. if topdir is not None:
  101. if resdir is not None or setprefix is not None:
  102. usage("-t can't be specified with -r or -s")
  103. return -1
  104. setprefix = os.path.join(topdir, "Set")
  105. resdir = os.path.join(topdir, "reservoir")
  106. else:
  107. if setprefix is None:
  108. setprefix = SETPREFIX
  109. if resdir is None:
  110. resdir = RESDIR
  111. if not os.path.exists(resdir):
  112. print >> sys.stderr, "reservoir directory %s doesn't exist" % resdir
  113. return 1
  114. res = os.listdir(resdir)
  115. dirs = glob.glob(setprefix + "*")
  116. if not dirs:
  117. print >> sys.stderr, "no directories starting with", setprefix, "exist."
  118. return 1
  119. stuff = []
  120. n = len(res)
  121. for d in dirs:
  122. fs = os.listdir(d)
  123. n += len(fs)
  124. stuff.append((d, fs))
  125. if nperdir * len(dirs) > n:
  126. print >> sys.stderr, "not enough files to go around - use lower -n."
  127. return 1
  128. if ((setprefix.find("Ham") >= 0 and resdir.find("Spam") >= 0) or
  129. (setprefix.find("Spam") >= 0 and resdir.find("Ham") >= 0)):
  130. yn = raw_input("Reservoir and Set dirs appear not to match. "
  131. "Continue? (y/n) ")
  132. if yn.lower()[0:1] != 'y':
  133. return 1
  134. for (d, fs) in stuff:
  135. if len(fs) <= nperdir:
  136. continue
  137. random.shuffle(fs)
  138. movethese = fs[nperdir:]
  139. del fs[nperdir:]
  140. if dryrun:
  141. print "would move", len(movethese), "files from", d, \
  142. "to reservoir", resdir
  143. res.extend(movethese)
  144. else:
  145. for f in movethese:
  146. newname = migrate(os.path.join(d, f), resdir, verbose)
  147. res.append(newname)
  148. random.shuffle(res)
  149. for (d, fs) in stuff:
  150. assert len(fs) <= nperdir
  151. if nperdir == len(fs):
  152. continue
  153. numtomove = nperdir - len(fs)
  154. assert 0 < numtomove <= len(res)
  155. movethese = res[-numtomove:]
  156. del res[-numtomove:]
  157. if dryrun:
  158. print "would move", len(movethese), "files from reservoir", \
  159. resdir, "to", d
  160. else:
  161. for f in movethese:
  162. if confirm:
  163. print file(os.path.join(resdir, f)).read()
  164. ok = raw_input('good enough? ').lower()
  165. if not ok.startswith('y'):
  166. continue
  167. migrate(os.path.join(resdir, f), d, verbose)
  168. return 0
  169. if __name__ == "__main__":
  170. sys.exit(main(sys.argv[1:]))
  171. if __name__ == "__main__":
  172. sys.exit(main(sys.argv[1:]))
  173. try:
  174. True, False
  175. except NameError:
  176. True, False = 1, 0