/tools/stats/gsummary.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 114 lines · 91 code · 15 blank · 8 comment · 31 complexity · a7ea785f992504f41d51b08cfdd64b4a MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys, re, tempfile
  3. from rpy import *
  4. # Older py compatibility
  5. try:
  6. set()
  7. except:
  8. from sets import Set as set
  9. assert sys.version_info[:2] >= ( 2, 4 )
  10. def stop_err( msg ):
  11. sys.stderr.write( msg )
  12. sys.exit()
  13. def S3_METHODS( all="key" ):
  14. Group_Math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
  15. "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
  16. "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
  17. "cumsum", "cumprod", "cummax", "cummin", "c" ]
  18. Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
  19. if all is "key":
  20. return { 'Math' : Group_Math, 'Ops' : Group_Ops }
  21. def main():
  22. try:
  23. datafile = sys.argv[1]
  24. outfile_name = sys.argv[2]
  25. expression = sys.argv[3]
  26. except:
  27. stop_err( 'Usage: python gsummary.py input_file ouput_file expression' )
  28. math_allowed = S3_METHODS()[ 'Math' ]
  29. ops_allowed = S3_METHODS()[ 'Ops' ]
  30. # Check for invalid expressions
  31. for word in re.compile( '[a-zA-Z]+' ).findall( expression ):
  32. if word and not word in math_allowed:
  33. stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )
  34. symbols = set()
  35. for symbol in re.compile( '[^a-z0-9\s]+' ).findall( expression ):
  36. if symbol and not symbol in ops_allowed:
  37. stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
  38. else:
  39. symbols.add( symbol )
  40. if len( symbols ) == 1 and ',' in symbols:
  41. # User may have entered a comma-separated list r_data_frame columns
  42. stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )
  43. # Find all column references in the expression
  44. cols = []
  45. for col in re.compile( 'c[0-9]+' ).findall( expression ):
  46. try:
  47. cols.append( int( col[1:] ) - 1 )
  48. except:
  49. pass
  50. tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
  51. # Write the R header row to the temporary file
  52. hdr_str = "\t".join( "c%s" % str( col+1 ) for col in cols )
  53. tmp_file.write( "%s\n" % hdr_str )
  54. skipped_lines = 0
  55. first_invalid_line = 0
  56. i = 0
  57. for i, line in enumerate( file( datafile ) ):
  58. line = line.rstrip( '\r\n' )
  59. if line and not line.startswith( '#' ):
  60. valid = True
  61. fields = line.split( '\t' )
  62. # Write the R data row to the temporary file
  63. for col in cols:
  64. try:
  65. float( fields[ col ] )
  66. except:
  67. skipped_lines += 1
  68. if not first_invalid_line:
  69. first_invalid_line = i + 1
  70. valid = False
  71. break
  72. if valid:
  73. data_str = "\t".join( fields[ col ] for col in cols )
  74. tmp_file.write( "%s\n" % data_str )
  75. tmp_file.flush()
  76. if skipped_lines == i + 1:
  77. stop_err( "Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements." )
  78. else:
  79. # summary function and return labels
  80. summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
  81. headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
  82. headings_str = "\t".join( headings )
  83. set_default_mode( NO_CONVERSION )
  84. r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )
  85. outfile = open( outfile_name, 'w' )
  86. for col in re.compile( 'c[0-9]+' ).findall( expression ):
  87. r.assign( col, r[ "$" ]( r_data_frame, col ) )
  88. try:
  89. summary = summary_func( r( expression ) )
  90. except RException, s:
  91. outfile.close()
  92. stop_err( "Computation resulted in the following error: %s" % str( s ) )
  93. summary = summary.as_py( BASIC_CONVERSION )
  94. outfile.write( "#%s\n" % headings_str )
  95. outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
  96. outfile.close()
  97. if skipped_lines:
  98. print "Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % ( skipped_lines, first_invalid_line )
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__": main()