PageRenderTime 48ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/TEES/Core/DivideExamples.py

https://bitbucket.org/yumyai/tees
Python | 76 lines | 61 code | 12 blank | 3 comment | 15 complexity | a80653f3982669bd8eea6c601498e1d4 MD5 | raw file
  1. """
  2. Pseudorandomly distributed subsets
  3. """
  4. __version__ = "$Revision: 1.4 $"
  5. import sys
  6. from TEES.Core import Split
  7. def getDocumentId(idString):
  8. return idString.rsplit(".",2)[0]
  9. def getIdFromLine(line):
  10. assert(line.find("#") != -1)
  11. return line.split("#")[-1].strip()
  12. def getDocumentIds(filename):
  13. documentIds = []
  14. inputFile = open(filename, "rt")
  15. try:
  16. for line in inputFile:
  17. if len(line) == 0 or line[0] == "#":
  18. continue
  19. docId = getDocumentId(getIdFromLine(line))
  20. if not docId in documentIds:
  21. documentIds.append(docId)
  22. finally:
  23. inputFile.close()
  24. return documentIds
  25. def getDocumentFolds(documentIds, folds):
  26. sample = Split.getFolds(len(documentIds),folds)
  27. division = {}
  28. for i in range(len(documentIds)):
  29. division[documentIds[i]] = sample[i]
  30. return division
  31. def divideExamples(filename, outputFilenames):
  32. print >> sys.stderr, "Reading document ids"
  33. documentIds = getDocumentIds(filename)
  34. print >> sys.stderr, "Dividing documents into folds"
  35. division = getDocumentFolds(documentIds, len(outputFilenames))
  36. print >> sys.stderr, "Dividing examples"
  37. outputFiles = []
  38. for name in outputFilenames:
  39. outputFiles.append(open(name, "wt"))
  40. inputFile = open(filename, "rt")
  41. try:
  42. for line in inputFile:
  43. if len(line) == 0 or line[0] == "#":
  44. continue
  45. docId = getDocumentId(getIdFromLine(line))
  46. outputFiles[division[docId]].write(line)
  47. finally:
  48. inputFile.close()
  49. for outputFile in outputFiles:
  50. outputFile.close()
  51. if __name__=="__main__":
  52. from optparse import OptionParser
  53. defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
  54. optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
  55. optparser.add_option("-i", "--input", default=defaultAnalysisFilename, dest="input", help="Corpus in analysis format", metavar="FILE")
  56. optparser.add_option("-o", "--output", default="", dest="output", help="Output directory")
  57. optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation")
  58. (options, args) = optparser.parse_args()
  59. outputFilenames = []
  60. for i in range(options.folds):
  61. outputFilenames.append(options.output + options.input + ".fold" + str(i))
  62. divideExamples(options.input, outputFilenames)