PageRenderTime 47ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/fstmerge/examples/SpamBayes/rev3250-3267/right-branch-3267/testtools/sort+group.py

https://github.com/RoDaniel/featurehouse
Python | 124 lines | 107 code | 0 blank | 17 comment | 8 complexity | 220f341d3dbc97a7a5d22dd29fe83701 MD5 | raw file
  1. """Usage: sort+group.py [options]
  2. Where:
  3. -h
  4. Show usage and exit.
  5. -q
  6. Suppress verbose output.
  7. -a
  8. Run through all directories in the directories that the
  9. ham_directories and spam_directories are in. This is
  10. similar (identical with default ham/spam directories)
  11. to the 1.0.x sort+group.py behaviour.
  12. -o section:option:value
  13. set [section, option] in the options database to value.
  14. Sort and group the messages in the Data hierarchy.
  15. Run this prior to mksets.py for setting stuff up for testing of
  16. chronological incremental training.
  17. """
import getopt
import glob
import os
import sys
import time

try:
    from email.utils import parsedate_tz, mktime_tz
except ImportError:  # Python 2 releases before 2.5 only provide email.Utils
    from email.Utils import parsedate_tz, mktime_tz

from spambayes.Options import options
# Number of seconds in one day; main() divides a message's offset from the
# earliest message by this value to obtain its day-group number.
SECONDS_PER_DAY = 24 * 60 * 60
  26. def get_time(fpath):
  27. fh = file(fpath, 'rb')
  28. lines = iter(fh)
  29. for line in lines:
  30. if line.lower().startswith("received:"):
  31. break
  32. else:
  33. print("\nNo Received header found.")
  34. fh.close()
  35. return None
  36. received = line
  37. for line in lines:
  38. if line[0] in ' \t':
  39. received += line
  40. else:
  41. break
  42. fh.close()
  43. i = received.rfind(';')
  44. if i < 0:
  45. print("\n" + received)
  46. print("No semicolon found in Received header.")
  47. return None
  48. datestring = received[i+1:]
  49. datestring = ' '.join(datestring.split())
  50. as_tuple = parsedate_tz(datestring)
  51. if as_tuple is None:
  52. print("\n" + received)
  53. print("Couldn't parse the date: %r" % datestring)
  54. return None
  55. return mktime_tz(as_tuple)
  56. def usage(code, msg=''):
  57. """Print usage message and sys.exit(code)."""
  58. if msg:
  59. print(msg, file=sys.stderr)
  60. print(file=sys.stderr)
  61. print(__doc__ % globals(), file=sys.stderr)
  62. sys.exit(code)
  63. def main():
  64. """Main program; parse options and go."""
  65. from os.path import join, split
  66. import getopt
  67. try:
  68. opts, args = getopt.getopt(sys.argv[1:], 'hqao:', ['option='])
  69. except getopt.error as msg:
  70. usage(1, msg)
  71. loud = True
  72. all_data = False
  73. for opt, arg in opts:
  74. if opt == '-h':
  75. usage(0)
  76. elif opt == '-q':
  77. loud = False
  78. elif opt == '-a':
  79. all_data = True
  80. elif opt in ('-o', '--option'):
  81. options.set_from_cmdline(arg, sys.stderr)
  82. data = [] # list of (time_received, dirname, basename) triples
  83. if loud:
  84. print("Scanning everything")
  85. now = time.time()
  86. hdir = os.path.dirname(options["TestDriver", "ham_directories"])
  87. sdir = os.path.dirname(options["TestDriver", "spam_directories"])
  88. if all_data:
  89. hdir = os.path.dirname(hdir)
  90. sdir = os.path.dirname(sdir)
  91. files = glob.glob(os.path.join(hdir, "*", "*", "*"))
  92. if sdir != hdir:
  93. files.extend(glob.glob(os.path.join(sdir, "*", "*", "*")))
  94. else:
  95. files = glob.glob(os.path.join(hdir, "*", "*"))
  96. files.extend(glob.glob(os.path.join(sdir, "*", "*")))
  97. for name in files:
  98. if loud:
  99. sys.stdout.write("%-78s\r" % name)
  100. sys.stdout.flush()
  101. when_received = get_time(name) or now
  102. data.append((when_received,) + split(name))
  103. if loud:
  104. print("")
  105. print("Sorting ...")
  106. data.sort()
  107. if loud:
  108. print("Renaming first pass ...")
  109. for dummy, dirname, basename in data:
  110. os.rename(join(dirname, basename),
  111. join(dirname, "-" + basename))
  112. if loud:
  113. print("Renaming second pass ...")
  114. earliest = data[0][0] # timestamp of earliest msg received
  115. i = 0
  116. for when_received, dirname, basename in data:
  117. extension = os.path.splitext(basename)[-1]
  118. group = int((when_received - earliest) / SECONDS_PER_DAY)
  119. newbasename = "%04d-%06d" % (group, i)
  120. os.rename(join(dirname, "-" + basename),
  121. join(dirname, newbasename + extension))
  122. i += 1
# Run the sort-and-group pass only when executed as a script, not on import.
if __name__ == "__main__":
    main()