/scripts/tools/annotation_profiler/build_profile_indexes.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 338 lines (281 code, 24 blank, 33 comment)

#!/usr/bin/env python
#Dan Blankenberg
VERSION = '1.0.0' # version of this script
from optparse import OptionParser
import os, gzip, struct, time
from ftplib import FTP #do we want a different method than FTP to determine chrom names, e.g. use a local copy?

#import md5 from hashlib; if python 2.4 or less, use old md5
try:
    from hashlib import md5
except ImportError:
    from md5 import new as md5

#import BitSet from bx-python; try using eggs and package resources, fall back to any local installation
try:
    from galaxy import eggs
    import pkg_resources
    pkg_resources.require( "bx-python" )
except: pass #Maybe there is a local installation available
from bx.bitset import BitSet

#Define constants
STRUCT_FMT = '<I'
STRUCT_SIZE = struct.calcsize( STRUCT_FMT )
DEFAULT_BITSET_SIZE = 300000000
CHUNK_SIZE = 1024

#Headers used to parse .sql files to determine column indexes for chromosome name, start and end
alias_spec = {
    'chromCol' : [ 'chrom', 'CHROMOSOME', 'CHROM', 'Chromosome Name', 'tName' ],
    'startCol' : [ 'start', 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ],
    'endCol'   : [ 'end', 'END', 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ],
}

#Headers used to parse trackDb.txt.gz
#TODO: these should be parsed directly from trackDb.sql
trackDb_headers = [ "tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings" ]

def get_columns( filename ):
    #parse a MySQL dump (.sql) file and return the table name plus the column
    #indexes for chromosome, start and end, matched against alias_spec
    input_sql = open( filename ).read()
    input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0]
    input_sql = input_sql.split( ' (', 1 )
    table_name = input_sql[0].strip().strip( '`' )
    input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ]
    print input_sql
    chrom_col = None
    start_col = None
    end_col = None
    for col_name in alias_spec['chromCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                chrom_col = i
                break
        if chrom_col is not None:
            break
    for col_name in alias_spec['startCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                start_col = i
                break
        if start_col is not None:
            break
    for col_name in alias_spec['endCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                end_col = i
                break
        if end_col is not None:
            break
    return table_name, chrom_col, start_col, end_col

def create_grouping_xml( input_dir, output_dir, dbkey ):
    #write a Galaxy options XML file (<dbkey>_tables.xml) that groups the
    #profiled tables by UCSC group and by parent track
    output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
    def load_groups( file_name = 'grp.txt.gz' ):
        groups = {}
        for line in gzip.open( os.path.join( input_dir, file_name ) ):
            fields = line.split( '\t' )
            groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] }
        return groups
    f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
    out = open( output_filename, 'wb' )
    tables = {}
    cur_buf = ''
    while True:
        line = f.readline()
        if not line: break
        #remove new lines
        line = line.rstrip( '\n\r' )
        line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
        cur_buf += "%s\n" % line.rstrip( '\\' )
        if line.endswith( '\\' ):
            continue #line is wrapped, next line
        #all fields should be loaded now...
        fields = cur_buf.split( '\t' )
        cur_buf = '' #reset buffer
        assert len( fields ) == len( trackDb_headers ), 'Failed parsing trackDb.txt.gz; fields: %s' % fields
        table_name = fields[ 0 ]
        tables[ table_name ] = {}
        for field_name, field_value in zip( trackDb_headers, fields ):
            tables[ table_name ][ field_name ] = field_value
        #split settings field into a dict
        fields = fields[-1].split( '\n' )
        tables[ table_name ][ 'settings' ] = {}
        for field in fields:
            setting_fields = field.split( ' ', 1 )
            setting_name = setting_value = setting_fields[ 0 ]
            if len( setting_fields ) > 1:
                setting_value = setting_fields[ 1 ]
            if setting_name or setting_value:
                tables[ table_name ][ 'settings' ][ setting_name ] = setting_value
    #Load Groups
    groups = load_groups()
    in_groups = {}
    for table_name, values in tables.iteritems():
        if os.path.exists( os.path.join( output_dir, table_name ) ):
            group = values['grp']
            if group not in in_groups:
                in_groups[group] = {}
            #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead
            #subTrack = values.get( 'settings', {} ).get( 'subTrack', table_name )
            subTrack = values.get( 'settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because value could be e.g. 'trackgroup on'
            if subTrack not in in_groups[group]:
                in_groups[group][subTrack] = []
            in_groups[group][subTrack].append( table_name )

    assigned_tables = []
    out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) )
    out.write( "  <options>\n" )
    for group, subTracks in sorted( in_groups.iteritems() ):
        out.write( """    <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) )
        for sub_name, sub_tracks in subTracks.iteritems():
            if len( sub_tracks ) > 1:
                out.write( """      <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) )
                sub_tracks.sort()
                for track in sub_tracks:
                    track_label = track
                    if "$" not in tables[track]['shortLabel']:
                        track_label = tables[track]['shortLabel']
                    out.write( """        <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                    assigned_tables.append( track )
                out.write( "      </option>\n" )
            else:
                track = sub_tracks[0]
                track_label = track
                if "$" not in tables[track]['shortLabel']:
                    track_label = tables[track]['shortLabel']
                out.write( """      <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                assigned_tables.append( track )
        out.write( "    </option>\n" )
    unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) )
    if unassigned_tables:
        out.write( """    <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
        for table_name in unassigned_tables:
            out.write( """      <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) )
        out.write( "    </option>\n" )
    out.write( "  </options>\n" )
    out.write( """</filter>\n""" )
    out.close()
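#For illustration only (group, table and label names below are hypothetical and
#depend on the grp.txt.gz and trackDb.txt.gz contents for the chosen dbkey), the
#generated <dbkey>_tables.xml produced by create_grouping_xml() looks roughly like:
#
#  <filter type="data_meta" data_ref="input1" meta_key="dbkey" value="hg18">
#    <options>
#      <option name="Genes and Gene Prediction Tracks" value="group-genes">
#        <option name="UCSC Genes" value="knownGene"/>
#      </option>
#    </options>
#  </filter>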

def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
    #generate hash for profiled table directories
    #sort directories off output root (files in the output root are not hashed, including the profiler_info.txt file)
    #sort files in each directory and hash file contents
    profiled_hash = md5()
    for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ):
        for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ):
            f = open( os.path.join( output_dir, table_dir, filename ), 'rb' )
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                profiled_hash.update( hash_chunk )
    profiled_hash = profiled_hash.hexdigest()

    #generate hash for input dir
    #sort directories off input root
    #sort files in each directory and hash file contents
    database_hash = md5()
    for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
        for filename in sorted( filenames ):
            f = open( os.path.join( dirpath, filename ), 'rb' ) #dirpath already includes input_dir
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                database_hash.update( hash_chunk )
    database_hash = database_hash.hexdigest()

    #write out info file
    out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
    out.write( 'dbkey\t%s\n' % ( dbkey ) )
    out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) )
    out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
    for line in open( os.path.join( input_dir, 'trackDb.sql' ) ):
        line = line.strip()
        if line.startswith( '-- Dump completed on ' ):
            line = line[ len( '-- Dump completed on ' ): ]
            out.write( 'dump_time\t%s\n' % ( line ) )
            break
    out.write( 'dump_hash\t%s\n' % ( database_hash ) )
    out.write( 'profiler_time\t%s\n' % ( time.time() ) )
    out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
    out.write( 'profiler_version\t%s\n' % ( VERSION ) )
    out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
    out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
    out.close()

def __main__():
    usage = "usage: %prog options"
    parser = OptionParser( usage=usage )
    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path', '%s', 'database' ), help='Input Directory' )
    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations', '%s' ), help='Output Directory' )
    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when the chromInfo.txt.gz method fails' )
    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP path; used for chromosome info when the chromInfo.txt.gz method fails' )
    ( options, args ) = parser.parse_args()

    input_dir = options.input_dir
    if '%' in input_dir:
        input_dir = input_dir % options.dbkey
    assert os.path.exists( input_dir ), 'Input directory does not exist'
    output_dir = options.output_dir
    if '%' in output_dir:
        output_dir = output_dir % options.dbkey
    assert not os.path.exists( output_dir ), 'Output directory already exists'
    os.makedirs( output_dir )
    ftp_path = options.ftp_path
    if '%' in ftp_path:
        ftp_path = ftp_path % options.dbkey

    #Get chromosome names and lengths
    chrom_lengths = {}
    if options.chromosomes:
        for chrom in options.chromosomes.split( ',' ):
            fields = chrom.split( '=' )
            chrom = fields[0]
            if len( fields ) > 1:
                chrom_len = int( fields[1] )
            else:
                chrom_len = options.bitset_size
            chrom_lengths[ chrom ] = chrom_len
        chroms = chrom_lengths.keys()
        print 'Chrom info taken from command line option.'
    else:
        try:
            for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
                fields = line.strip().split( '\t' )
                chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
            chroms = chrom_lengths.keys()
            print 'Chrom info taken from chromInfo.txt.gz.'
        except Exception, e:
            print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
            chrom_lengths = {} #zero out chrom_lengths
            chroms = []
            ftp = FTP( options.ftp_site )
            ftp.login()
            for name in ftp.nlst( ftp_path ):
                if name.endswith( '.fa.gz' ):
                    chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
            ftp.close()
            for chrom in chroms:
                chrom_lengths[ chrom ] = options.bitset_size
    #sort chroms by length of name, descending; necessary for when table names start with the chrom name
    chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )

    #parse tables from local files
    #loop through directory contents; if a file ends in '.sql', process the table
    for filename in os.listdir( input_dir ):
        if filename.endswith( '.sql' ):
            base_filename = filename[ 0:-len( '.sql' ) ]
            table_out_dir = os.path.join( output_dir, base_filename )
            #some tables are chromosome specific, let's strip off the chrom name
            for chrom in chroms:
                if base_filename.startswith( "%s_" % chrom ):
                    #found chromosome
                    table_out_dir = os.path.join( output_dir, base_filename[ len( "%s_" % chrom ): ] )
                    break
            #create table dir
            if not os.path.exists( table_out_dir ):
                os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
                print "Created table dir (%s)." % table_out_dir
            else:
                print "Table dir (%s) already exists." % table_out_dir
            #find column assignments
            table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
            if chrom_col is None or start_col is None or end_col is None:
                print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
                if not os.listdir( table_out_dir ):
                    print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
                    os.rmdir( table_out_dir )
                continue
            #build bitsets from table
            bitset_dict = {}
            for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
                fields = line.strip().split( '\t' )
                chrom = fields[ chrom_col ]
                start = int( fields[ start_col ] )
                end = int( fields[ end_col ] )
                if chrom not in bitset_dict:
                    bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
                bitset_dict[ chrom ].set_range( start, end - start )
            #write bitsets as profiled annotations
            for chrom_name, chrom_bits in bitset_dict.iteritems():
                out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
                end = 0
                total_regions = 0
                total_coverage = 0
                max_size = chrom_lengths.get( chrom_name, options.bitset_size )
                while True:
                    start = chrom_bits.next_set( end )
                    if start >= max_size:
                        break
                    end = chrom_bits.next_clear( start )
                    out.write( struct.pack( STRUCT_FMT, start ) )
                    out.write( struct.pack( STRUCT_FMT, end ) )
                    total_regions += 1
                    total_coverage += end - start
                    if end >= max_size:
                        break
                out.close()
                open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
                open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )

    #create xml
    create_grouping_xml( input_dir, output_dir, options.dbkey )
    #create database dump info file, for database version control
    write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )


if __name__ == "__main__": __main__()
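
The script is run against a local copy of a UCSC database dump directory, e.g. `python build_profile_indexes.py -d hg18 -i golden_path/hg18/database -o profiled_annotations/hg18` (mirroring the option defaults above). Each `<chrom>.covered` file it produces is a flat binary stream of (start, end) pairs packed with STRUCT_FMT ('<I', little-endian unsigned 32-bit integers). The following is a minimal sketch of how such a file could be read back for inspection; it is not part of the script, and the command-line argument is only an illustrative path.

#!/usr/bin/env python
#Sketch: read the (start, end) region pairs from a '.covered' file written by
#build_profile_indexes.py. The struct format mirrors the script's STRUCT_FMT.
import struct
import sys

STRUCT_FMT = '<I'
STRUCT_SIZE = struct.calcsize( STRUCT_FMT )

def read_covered( filename ):
    #return a list of (start, end) tuples from one '.covered' file
    regions = []
    f = open( filename, 'rb' )
    while True:
        #each region is two packed uint32 values: start, then end
        start_raw = f.read( STRUCT_SIZE )
        end_raw = f.read( STRUCT_SIZE )
        if len( start_raw ) < STRUCT_SIZE or len( end_raw ) < STRUCT_SIZE:
            break
        start = struct.unpack( STRUCT_FMT, start_raw )[0]
        end = struct.unpack( STRUCT_FMT, end_raw )[0]
        regions.append( ( start, end ) )
    f.close()
    return regions

if __name__ == "__main__":
    #hypothetical usage: python read_covered.py profiled_annotations/hg18/knownGene/chr1.covered
    regions = read_covered( sys.argv[1] )
    print 'regions:', len( regions )
    print 'coverage:', sum( end - start for start, end in regions )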