/tools/filters/random_lines_two_pass.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 71 lines · 49 code · 13 blank · 9 comment · 10 complexity · c7bd7a10e661639fc4e57fe7505f7e70 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Dan Blankenberg
  3. #Selects N random lines from a file and outputs to another file, maintaining original line order
  4. #allows specifying a seed
  5. #does two passes to determine line offsets/count, and then to output contents
  6. import optparse, random
  7. def get_random_by_subtraction( line_offsets, num_lines ):
  8. while len( line_offsets ) > num_lines:
  9. del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ]
  10. return line_offsets
  11. def get_random_by_sample( line_offsets, num_lines ):
  12. line_offsets = random.sample( line_offsets, num_lines )
  13. line_offsets.sort()
  14. return line_offsets
  15. def get_random( line_offsets, num_lines ):
  16. if num_lines > ( len( line_offsets ) / 2 ):
  17. return get_random_by_subtraction( line_offsets, num_lines )
  18. else:
  19. return get_random_by_sample( line_offsets, num_lines )
  20. def __main__():
  21. #Parse Command Line
  22. parser = optparse.OptionParser()
  23. parser.add_option( '-s', '--seed', dest='seed', action='store', type="string", default=None, help='Set the random seed.' )
  24. (options, args) = parser.parse_args()
  25. assert len( args ) == 3, "Invalid command line specified."
  26. input = open( args[0], 'rb' )
  27. output = open( args[1], 'wb' )
  28. num_lines = int( args[2] )
  29. assert num_lines > 0, "You must select at least one line."
  30. if options.seed is not None:
  31. random.seed( options.seed )
  32. #get line offsets
  33. line_offsets = []
  34. teller = input.tell
  35. readliner = input.readline
  36. appender = line_offsets.append
  37. while True:
  38. offset = teller()
  39. if readliner():
  40. appender( offset )
  41. else:
  42. break
  43. total_lines = len( line_offsets )
  44. assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines )
  45. #get random line offsets
  46. line_offsets = get_random( line_offsets, num_lines )
  47. #write out random lines
  48. seeker = input.seek
  49. writer = output.write
  50. for line_offset in line_offsets:
  51. seeker( line_offset )
  52. writer( readliner() )
  53. input.close()
  54. output.close()
  55. print "Kept %i of %i total lines." % ( num_lines, total_lines )
  56. if options.seed is not None:
  57. print 'Used random seed of "%s".' % options.seed
  58. if __name__=="__main__": __main__()