PageRenderTime 38ms CodeModel.GetById 17ms app.highlight 15ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/filters/random_lines_two_pass.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 71 lines | 49 code | 13 blank | 9 comment | 11 complexity | c7bd7a10e661639fc4e57fe7505f7e70 MD5 | raw file
 1#!/usr/bin/env python
 2#Dan Blankenberg
#Selects N random lines from a file and writes them to another file, preserving the original line order.
#Allows specifying a random seed.
#Makes two passes over the input: the first records line offsets and the line count, the second writes the selected lines.
 6
 7import optparse, random
 8
 9def get_random_by_subtraction( line_offsets, num_lines ):
10    while len( line_offsets ) > num_lines:
11        del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ]
12    return line_offsets
13
def get_random_by_sample( line_offsets, num_lines ):
    """Pick num_lines entries from line_offsets at random, in ascending order.

    A new sorted list is returned; the list passed in is left unchanged.
    """
    return sorted( random.sample( line_offsets, num_lines ) )
18
def get_random( line_offsets, num_lines ):
    """Choose num_lines of the given offsets at random, keeping their order.

    When at most half of the offsets are wanted they are sampled directly;
    when more than half are wanted the unwanted ones are removed instead.
    """
    if num_lines <= ( len( line_offsets ) / 2 ):
        return get_random_by_sample( line_offsets, num_lines )
    return get_random_by_subtraction( line_offsets, num_lines )
24
def __main__():
    """Command-line entry point: copy randomly selected lines to a new file.

    Expects three positional arguments -- input path, output path and the
    number of lines to keep -- plus an optional -s/--seed value that is
    passed to random.seed(). The input is read twice: once to record the
    byte offset of every line, and once to copy the selected lines, in
    their original order, to the output.

    Raises:
        AssertionError: wrong number of arguments, a line count below one,
            or a line count larger than the number of lines in the input.
        ValueError: the line count argument is not an integer.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-s', '--seed', dest='seed', action='store', type="string", default=None, help='Set the random seed.' )
    (options, args) = parser.parse_args()

    # The checks below raise AssertionError explicitly instead of using the
    # assert statement, so they still run under "python -O" while failing
    # with the same exception type and message as before.
    if len( args ) != 3:
        raise AssertionError( "Invalid command line specified." )
    input_filename, output_filename = args[0], args[1]
    num_lines = int( args[2] )
    if num_lines <= 0:
        raise AssertionError( "You must select at least one line." )

    if options.seed is not None:
        random.seed( options.seed )

    in_file = open( input_filename, 'rb' )
    try:
        #get line offsets
        line_offsets = []
        # Bound methods are cached in locals because these loops run once per line.
        teller = in_file.tell
        readliner = in_file.readline
        appender = line_offsets.append
        while True:
            offset = teller()
            if readliner():
                appender( offset )
            else:
                break

        total_lines = len( line_offsets )
        if num_lines > total_lines:
            raise AssertionError( "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines ) )

        #get random line offsets
        line_offsets = get_random( line_offsets, num_lines )

        #write out random lines
        # The output is opened only after every check has passed, so a
        # rejected request does not create or truncate the output file.
        out_file = open( output_filename, 'wb' )
        try:
            seeker = in_file.seek
            writer = out_file.write
            for line_offset in line_offsets:
                seeker( line_offset )
                writer( readliner() )
        finally:
            out_file.close()
    finally:
        in_file.close()

    # A single parenthesised argument prints identically on Python 2 and 3.
    print( "Kept %i of %i total lines." % ( num_lines, total_lines ) )
    if options.seed is not None:
        print( 'Used random seed of "%s".' % options.seed )
70    
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    __main__()