PageRenderTime 27ms CodeModel.GetById 19ms app.highlight 6ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/plotting/histogram.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 101 lines | 86 code | 11 blank | 4 comment | 18 complexity | a6b4b022439f285578766354544feb80 MD5 | raw file
  1#!/usr/bin/env python
  2#Greg Von Kuster
  3
  4import sys
  5from rpy import *
  6
  7assert sys.version_info[:2] >= ( 2, 4 )
  8
  9def stop_err(msg):
 10    sys.stderr.write(msg)
 11    sys.exit()
 12
 13def main():
 14
 15    # Handle input params
 16    in_fname = sys.argv[1]
 17    out_fname = sys.argv[2] 
 18    try:
 19        column = int( sys.argv[3] ) - 1
 20    except:
 21        stop_err( "Column not specified, your query does not contain a column of numerical data." )
 22    title = sys.argv[4]
 23    xlab = sys.argv[5]
 24    breaks = int( sys.argv[6] )
 25    if breaks == 0:
 26        breaks = "Sturges"
 27    if sys.argv[7] == "true":
 28        density = True
 29    else: density = False
 30    if len( sys.argv ) >= 9 and sys.argv[8] == "true":
 31        frequency = True
 32    else: frequency = False
 33
 34    matrix = []
 35    skipped_lines = 0
 36    first_invalid_line = 0
 37    invalid_value = ''
 38    i = 0
 39    for i, line in enumerate( file( in_fname ) ):
 40        valid = True
 41        line = line.rstrip('\r\n')
 42        # Skip comments
 43        if line and not line.startswith( '#' ): 
 44            # Extract values and convert to floats
 45            row = []
 46            try:
 47                fields = line.split( "\t" )
 48                val = fields[column]
 49                if val.lower() == "na":
 50                    row.append( float( "nan" ) )
 51            except:
 52                valid = False
 53                skipped_lines += 1
 54                if not first_invalid_line:
 55                    first_invalid_line = i+1
 56            else:
 57                try:
 58                    row.append( float( val ) )
 59                except ValueError:
 60                    valid = False
 61                    skipped_lines += 1
 62                    if not first_invalid_line:
 63                        first_invalid_line = i+1
 64                        invalid_value = fields[column]
 65        else:
 66            valid = False
 67            skipped_lines += 1
 68            if not first_invalid_line:
 69                first_invalid_line = i+1
 70
 71        if valid:
 72            matrix += row
 73
 74    if skipped_lines < i:
 75        try:
 76            a = r.array( matrix )
 77            r.pdf( out_fname, 8, 8 )
 78            histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks )
 79            if density:
 80                density = r.density( a )
 81                if frequency:
 82                    scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints
 83                    density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] )
 84                r.lines( density )
 85            r.dev_off()
 86        except Exception, exc:
 87            stop_err( "%s" %str( exc ) )
 88    else:
 89        if i == 0:
 90            stop_err("Input dataset is empty.")
 91        else:
 92            stop_err( "All values in column %s are non-numeric." %sys.argv[3] )
 93
 94    print "Histogram of column %s. " %sys.argv[3]
 95    if skipped_lines > 0:
 96        print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value )
 97
 98    r.quit( save="no" )
 99    
100if __name__ == "__main__":
101    main()