/lib/galaxy/visualization/tracks/data_providers.py

https://bitbucket.org/cistrome/cistrome-harvard/

"""
Data providers for tracks visualizations.
"""

import sys
from math import ceil, log
import pkg_resources
pkg_resources.require( "bx-python" )
if sys.version_info[:2] == (2, 4):
    pkg_resources.require( "ctypes" )
pkg_resources.require( "pysam" )
pkg_resources.require( "numpy" )
from galaxy.datatypes.util.gff_util import *
from galaxy.util.json import from_json_string
from bx.interval_index_file import Indexes
from bx.bbi.bigwig_file import BigWigFile
from bx.bbi.bigbed_file import BigBedFile  # Used by BigBedDataProvider below.
from galaxy.util.lrucache import LRUCache
from galaxy.visualization.tracks.summary import *
import galaxy_utils.sequence.vcf
from galaxy.datatypes.tabular import Vcf
from galaxy.datatypes.interval import Bed, Gff, Gtf

from pysam import csamtools, ctabix

ERROR_MAX_VALS = "Only the first %i %s in this region are displayed."

# Return None instead of NaN to pass jQuery 1.4's strict JSON
def float_nan(n):
    if n != n: # NaN != NaN
        return None
    else:
        return float(n)

def get_bounds( reads, start_pos_index, end_pos_index ):
    """
    Returns the minimum and maximum position for a set of reads.
    """
    max_low = sys.maxint
    max_high = -sys.maxint
    for read in reads:
        if read[ start_pos_index ] < max_low:
            max_low = read[ start_pos_index ]
        if read[ end_pos_index ] > max_high:
            max_high = read[ end_pos_index ]
    return max_low, max_high
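
# Worked example (illustrative comment only, not called anywhere in this
# module): read payloads below store start/end at indices 1 and 2, so
#     get_bounds( [ [ "id1", 100, 250 ], [ "id2", 90, 180 ] ], 1, 2 )
# returns ( 90, 250 ).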

class TracksDataProvider( object ):
    """ Base class for tracks data providers. """

    """
    Mapping from column name to payload data; this mapping is used to create
    filters. Key is column name, value is a dict with mandatory key 'index' and
    optional key 'name'. E.g. this defines column 4:

        col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }
    """
    col_name_data_attr_mapping = {}

    def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None ):
        """ Create basic data provider. """
        self.converted_dataset = converted_dataset
        self.original_dataset = original_dataset
        self.dependencies = dependencies

    def write_data_to_file( self, chrom, start, end, filename ):
        """
        Write data in region defined by chrom, start, and end to a file.
        """
        # Override.
        pass

    def valid_chroms( self ):
        """
        Returns chroms/contigs that the dataset contains.
        """
        return None # by default

    def has_data( self, chrom, start, end, **kwargs ):
        """
        Returns true if dataset has data in the specified genome window, false
        otherwise.
        """
        # Override.
        pass

    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        """
        Returns data in region defined by chrom, start, and end. start_val and
        max_vals are used to denote the data to return: start_val is the first element to
        return and max_vals indicates the number of values to return.

        Return value must be a dictionary with the following attributes:
            dataset_type, data
        """
        # Override.
        pass

    def get_filters( self ):
        """
        Returns filters for provider's data. Return value is a list of
        filters; each filter is a dictionary with the keys 'name', 'index', 'type'.
        NOTE: This method uses the original dataset's datatype and metadata to
        create the filters.
        """
        # Get column names.
        try:
            column_names = self.original_dataset.datatype.column_names
        except AttributeError:
            try:
                column_names = range( self.original_dataset.metadata.columns )
            except: # Give up.
                return []

        # Dataset must have column types; if not, cannot create filters.
        try:
            column_types = self.original_dataset.metadata.column_types
        except AttributeError:
            return []

        # Create and return filters.
        filters = []
        if self.original_dataset.metadata.viz_filter_cols:
            for viz_col_index in self.original_dataset.metadata.viz_filter_cols:
                # Some columns are optional, so can't assume that a filter
                # column is in dataset.
                if viz_col_index >= len( column_names ):
                    continue
                col_name = column_names[ viz_col_index ]
                # Make sure that column has a mapped index. If not, do not add filter.
                try:
                    attrs = self.col_name_data_attr_mapping[ col_name ]
                except KeyError:
                    continue
                filters.append(
                    { 'name': attrs[ 'name' ], 'type': column_types[ viz_col_index ],
                      'index': attrs[ 'index' ] } )
        return filters
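
# Illustrative sketch (not part of the original module): a minimal subclass
# showing how col_name_data_attr_mapping cooperates with get_filters(). The
# column name 'Score' and payload index 4 are hypothetical and only apply if
# the datatype's column_names/viz_filter_cols metadata reference that column.
#
#     class ScoreFilterDataProvider( TracksDataProvider ):
#         col_name_data_attr_mapping = { 'Score': { 'index': 4, 'name': 'Score' } }
#
#     # For the matching metadata column, get_filters() would then emit:
#     #     { 'name': 'Score', 'type': <column type>, 'index': 4 }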

class BedDataProvider( TracksDataProvider ):
    """
    Abstract class that processes BED data from text format to payload format.

    Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
    """

    def get_iterator( self, chrom, start, end ):
        raise Exception( "Unimplemented Method" )

    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        iterator = self.get_iterator( chrom, start, end )
        return self.process_data( iterator, start_val, max_vals, **kwargs )

    def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ):
        """
        Converts lines from the iterator into payload lists for the client.
        """
        # Build data to return. Payload format is:
        # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
        #   <thick_end>, <blocks> ]
        #
        # First three entries are mandatory, others are optional.
        #
        filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
        no_detail = ( "no_detail" in kwargs )
        rval = []
        message = None
        for count, line in enumerate( iterator ):
            if count < start_val:
                continue
            if max_vals and count - start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
                break
            # TODO: can we use column metadata to fill out payload?
            # TODO: use function to set payload data
            feature = line.split()
            length = len( feature )
            # Unique id is just a hash of the line
            payload = [ hash( line ), int( feature[1] ), int( feature[2] ) ]
            if no_detail:
                rval.append( payload )
                continue
            # Simpler way to add stuff, but type casting is not done.
            # Name, score, strand, thick start, thick end.
            #end = min( len( feature ), 8 )
            #payload.extend( feature[ 3:end ] )
            # Name, strand, thick start, thick end.
            if length >= 4:
                payload.append( feature[3] )
            if length >= 6:
                payload.append( feature[5] )
            if length >= 8:
                payload.append( int( feature[6] ) )
                payload.append( int( feature[7] ) )
            # Blocks.
            if length >= 12:
                block_sizes = [ int( n ) for n in feature[10].split(',') if n != '' ]
                block_starts = [ int( n ) for n in feature[11].split(',') if n != '' ]
                blocks = zip( block_sizes, block_starts )
                payload.append( [ ( int( feature[1] ) + block[1], int( feature[1] ) + block[1] + block[0] ) for block in blocks ] )
            # Score (filter data).
            if length >= 5 and filter_cols and filter_cols[0] == "Score":
                payload.append( float( feature[4] ) )
            rval.append( payload )
        return { 'data': rval, 'message': message }

    def write_data_to_file( self, chrom, start, end, filename ):
        iterator = self.get_iterator( chrom, start, end )
        out = open( filename, "w" )
        for line in iterator:
            out.write( "%s\n" % line )
        out.close()
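
# Worked example (illustrative): for the BED12 line
#     chr1  100  500  feat1  0  +  100  500  0  2  100,200  0,200
# process_data() produces, modulo the hash-based uid,
#     [ <hash>, 100, 500, 'feat1', '+', 100, 500, [ (100, 200), (300, 500) ] ]
# and appends float(score) == 0.0 only when filter_cols[0] == "Score".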

class SummaryTreeDataProvider( TracksDataProvider ):
    """
    Summary tree data provider for the Galaxy track browser.
    """

    CACHE = LRUCache( 20 ) # Store 20 recently accessed indices for performance

    def valid_chroms( self ):
        st = summary_tree_from_file( self.converted_dataset.file_name )
        return st.chrom_blocks.keys()

    def get_summary( self, chrom, start, end, **kwargs ):
        filename = self.converted_dataset.file_name
        st = self.CACHE[ filename ]
        if st is None:
            st = summary_tree_from_file( self.converted_dataset.file_name )
            self.CACHE[ filename ] = st

        # If chrom is not found in blocks, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chrom names defined as chrXXX and as XXX.
        if chrom in st.chrom_blocks:
            pass
        elif chrom[3:] in st.chrom_blocks:
            chrom = chrom[3:]
        else:
            return None

        resolution = max( 1, ceil( float( kwargs['resolution'] ) ) )
        level = ceil( log( resolution, st.block_size ) ) - 1
        level = int( max( level, 0 ) )
        if level <= 1:
            return "detail"

        stats = st.chrom_stats[ chrom ]
        results = st.query( chrom, int( start ), int( end ), level )
        if results == "detail" or results == "draw":
            return results
        else:
            return results, stats[ level ]["max"], stats[ level ]["avg"], stats[ level ]["delta"]

    def has_data( self, chrom ):
        """
        Returns true if dataset has data for this chrom.
        """
        # Get summary tree.
        filename = self.converted_dataset.file_name
        st = self.CACHE[ filename ]
        if st is None:
            st = summary_tree_from_file( self.converted_dataset.file_name )
            self.CACHE[ filename ] = st
        # Check for data.
        return st.chrom_blocks.get( chrom, None ) is not None or ( chrom and st.chrom_blocks.get( chrom[3:], None ) is not None )
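
# Worked example (illustrative; a block_size of 1000 is assumed here, the real
# value comes from the summary tree file): with resolution=1e7,
#     level = ceil( log( 1e7, 1000 ) ) - 1 = ceil( 2.33 ) - 1 = 2
# so the tree is queried at level 2; any level <= 1 short-circuits to "detail".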

class BamDataProvider( TracksDataProvider ):
    """
    Provides access to intervals from a sorted indexed BAM file.
    """

    def write_data_to_file( self, chrom, start, end, filename ):
        """
        Write reads in [chrom:start-end] to file.
        """
        # Open current BAM file using index.
        start, end = int( start ), int( end )
        bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb',
                                     index_filename=self.converted_dataset.file_name )
        try:
            data = bamfile.fetch( start=start, end=end, reference=chrom )
        except ValueError, e:
            # Some BAM files do not prefix chromosome names with chr, try without
            if chrom.startswith( 'chr' ):
                try:
                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
                except ValueError:
                    return None
            else:
                return None

        # Write new BAM file.
        # TODO: write headers as well?
        new_bamfile = csamtools.Samfile( template=bamfile, filename=filename, mode='wb' )
        for i, read in enumerate( data ):
            new_bamfile.write( read )
        new_bamfile.close()

        # Cleanup.
        bamfile.close()

    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
        """
        Fetch reads in the region and additional metadata.

        Returns a dict with the following attributes:
            data - a list of reads with the format
                [<guid>, <start>, <end>, <name>, <read_1>, <read_2>]
            where <read_1> has the format
                [<start>, <end>, <cigar>, ?<read_seq>?]
            and <read_2> has the format
                [<start>, <end>, <cigar>, ?<read_seq>?]
            For single-end reads, read has format:
                [<guid>, <start>, <end>, <name>, cigar, seq]
            NOTE: read end and sequence data are not valid for reads outside of
            requested region and should not be used.

            max_low - lowest coordinate for the returned reads
            max_high - highest coordinate for the returned reads
            message - error/informative message
        """
        start, end = int( start ), int( end )
        orig_data_filename = self.original_dataset.file_name
        index_filename = self.converted_dataset.file_name
        no_detail = "no_detail" in kwargs

        # Attempt to open the BAM file with index
        bamfile = csamtools.Samfile( filename=orig_data_filename, mode='rb', index_filename=index_filename )
        message = None
        try:
            data = bamfile.fetch( start=start, end=end, reference=chrom )
        except ValueError, e:
            # Some BAM files do not prefix chromosome names with chr, try without
            if chrom.startswith( 'chr' ):
                try:
                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
                except ValueError:
                    return None
            else:
                return None

        # Encode reads as list of lists.
        results = []
        paired_pending = {}
        for count, read in enumerate( data ):
            if count < start_val:
                continue
            if count - start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "reads" )
                break
            qname = read.qname
            seq = read.seq
            if read.cigar is not None:
                read_len = sum( [ cig[1] for cig in read.cigar ] ) # Use cigar to determine length
            else:
                read_len = len( seq ) # If no cigar, just use sequence length
            if read.is_proper_pair:
                if qname in paired_pending: # one in dict is always first
                    pair = paired_pending[ qname ]
                    results.append( [ "%i_%s" % ( pair['start'], qname ),
                                      pair['start'],
                                      read.pos + read_len,
                                      qname,
                                      [ pair['start'], pair['end'], pair['cigar'], pair['seq'] ],
                                      [ read.pos, read.pos + read_len, read.cigar, seq ]
                                      ] )
                    del paired_pending[ qname ]
                else:
                    paired_pending[ qname ] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq,
                                                'mate_start': read.mpos, 'rlen': read_len, 'cigar': read.cigar }
            else:
                results.append( [ "%i_%s" % ( read.pos, qname ), read.pos, read.pos + read_len, qname, read.cigar, read.seq ] )

        # Take care of reads whose mates are out of range.
        # TODO: count paired reads when adhering to max_vals?
        for qname, read in paired_pending.iteritems():
            if read['mate_start'] < read['start']:
                # Mate is before read.
                read_start = read['mate_start']
                read_end = read['end']
                # Make read_1 start=end so that length is 0 b/c we don't know
                # read length.
                r1 = [ read['mate_start'], read['mate_start'] ]
                r2 = [ read['start'], read['end'], read['cigar'], read['seq'] ]
            else:
                # Mate is after read.
                read_start = read['start']
                # Make read_2 start=end so that length is 0 b/c we don't know
                # read length. Hence, end of read is start of read_2.
                read_end = read['mate_start']
                r1 = [ read['start'], read['end'], read['cigar'], read['seq'] ]
                r2 = [ read['mate_start'], read['mate_start'] ]
            results.append( [ "%i_%s" % ( read_start, qname ), read_start, read_end, qname, r1, r2 ] )

        # Clean up.
        bamfile.close()

        max_low, max_high = get_bounds( results, 1, 2 )

        return { 'data': results, 'message': message, 'max_low': max_low, 'max_high': max_high }
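
# Usage sketch (illustrative; `bam_hda` and `bai_hda` stand in for dataset
# objects exposing a .file_name for the BAM file and its .bai index, respectively):
#
#     provider = BamDataProvider( converted_dataset=bai_hda,
#                                 original_dataset=bam_hda )
#     result = provider.get_data( "chr1", 10000, 20000, max_vals=500 )
#     # result: { 'data': [...], 'message': ..., 'max_low': ..., 'max_high': ... }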

class BBIDataProvider( TracksDataProvider ):
    """
    BBI data provider for the Galaxy track browser.
    """

    def valid_chroms( self ):
        # No way to return this info as of now
        return None

    def has_data( self, chrom ):
        f, bbi = self._get_dataset()
        all_dat = bbi.query( chrom, 0, 2147483647, 1 )
        f.close()
        return all_dat is not None

    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        # A bigwig can be a standalone bigwig file, in which case we use
        # original_dataset, or it can come from a wig->bigwig conversion, in
        # which case we use converted_dataset; _get_dataset hides the difference.
        f, bbi = self._get_dataset()

        if 'stats' in kwargs:
            all_dat = bbi.query( chrom, 0, 2147483647, 1 )
            f.close()
            if all_dat is None:
                return None
            all_dat = all_dat[0] # only 1 summary
            return { 'data': { 'max': float( all_dat['max'] ),
                               'min': float( all_dat['min'] ),
                               'total_frequency': float( all_dat['coverage'] ) } }

        start = int( start )
        end = int( end )

        # The first zoom level for BBI files is 640. If too much is requested, it will look at each block instead
        # of summaries. The calculation done is: zoom <> (end-start)/num_points/2.
        # Thus, the optimal number of points is (end-start)/num_points/2 = 640
        # num_points = (end-start) / 1280
        num_points = ( end - start ) / 1280
        if num_points < 1:
            num_points = end - start
        else:
            num_points = min( num_points, 500 )

        data = bbi.query( chrom, start, end, num_points )
        f.close()

        pos = start
        step_size = ( end - start ) / num_points
        result = []
        if data:
            for dat_dict in data:
                result.append( ( pos, float_nan( dat_dict['mean'] ) ) )
                pos += step_size

        return { 'data': result }
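
# Worked example (illustrative): for a 1,000,000 bp window,
#     num_points = 1000000 / 1280 = 781  ->  capped at 500
#     step_size  = 1000000 / 500  = 2000 bp per summary point
# while windows smaller than 1280 bp get one point per base.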

class BigBedDataProvider( BBIDataProvider ):
    def _get_dataset( self ):
        # Nothing converts to bigBed so we don't consider converted dataset
        f = open( self.original_dataset.file_name )
        return f, BigBedFile( file=f )

class BigWigDataProvider( BBIDataProvider ):
    def _get_dataset( self ):
        if self.converted_dataset is not None:
            f = open( self.converted_dataset.file_name )
        else:
            f = open( self.original_dataset.file_name )
        return f, BigWigFile( file=f )

class FilterableMixin:
    def get_filters( self ):
        """ Returns a dataset's filters. """

        # is_ functions taken from Tabular.set_meta
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except:
                return False

        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except:
                if column_text.strip().lower() == 'na':
                    return True # na is special cased to be a float
                return False

        #
        # Get filters.
        # TODOs:
        # (a) might be useful to move this into each datatype's set_meta method;
        # (b) could look at first N lines to ensure GTF attribute types are consistent.
        #
        filters = []
        # HACK: first 8 fields are for drawing, so start filter column index at 9.
        filter_col = 8
        if isinstance( self.original_dataset.datatype, Gff ):
            # Can filter by score and GTF attributes.
            filters = [ { 'name': 'Score',
                          'type': 'int',
                          'index': filter_col,
                          'tool_id': 'Filter1',
                          'tool_exp_name': 'c6' } ]
            filter_col += 1
            if isinstance( self.original_dataset.datatype, Gtf ):
                # Create filters based on dataset metadata.
                for name, a_type in self.original_dataset.metadata.attribute_types.items():
                    if a_type in [ 'int', 'float' ]:
                        filters.append(
                            { 'name': name,
                              'type': a_type,
                              'index': filter_col,
                              'tool_id': 'gff_filter_by_attribute',
                              'tool_exp_name': name } )
                        filter_col += 1

            '''
            # Old code: use first line in dataset to find attributes.
            for i, line in enumerate( open(self.original_dataset.file_name) ):
                if not line.startswith('#'):
                    # Look at first line for attributes and types.
                    attributes = parse_gff_attributes( line.split('\t')[8] )
                    for attr, value in attributes.items():
                        # Get attribute type.
                        if is_int( value ):
                            attr_type = 'int'
                        elif is_float( value ):
                            attr_type = 'float'
                        else:
                            attr_type = 'str'
                        # Add to filters.
                        if attr_type is not 'str':
                            filters.append( { 'name': attr, 'type': attr_type, 'index': filter_col } )
                            filter_col += 1
                    break
            '''
        elif isinstance( self.original_dataset.datatype, Bed ):
            # Can filter by score column only.
            filters = [ { 'name': 'Score',
                          'type': 'int',
                          'index': filter_col,
                          'tool_id': 'Filter1',
                          'tool_exp_name': 'c5' } ]

        return filters

class TabixDataProvider( FilterableMixin, TracksDataProvider ):
    """
    Tabix index data provider for the Galaxy track browser.
    """

    col_name_data_attr_mapping = { 4: { 'index': 4, 'name': 'Score' } }

    def get_iterator( self, chrom, start, end ):
        start, end = int( start ), int( end )
        if end >= ( 2 << 29 ):
            end = ( 2 << 29 ) - 1 # Tabix-enforced maximum
        bgzip_fname = self.dependencies['bgzip'].file_name

        # if os.path.getsize(self.converted_dataset.file_name) == 0:
        #     return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." }
        tabix = ctabix.Tabixfile( bgzip_fname, index_filename=self.converted_dataset.file_name )

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chrom names defined as chrXXX and as XXX.
        chrom = str( chrom )
        if chrom not in tabix.contigs and chrom.startswith( "chr" ) and ( chrom[3:] in tabix.contigs ):
            chrom = chrom[3:]

        return tabix.fetch( reference=chrom, start=start, end=end )

    def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ):
        iterator = self.get_iterator( chrom, start, end )
        return self.process_data( iterator, start_val, max_vals, **kwargs )

class IntervalIndexDataProvider( FilterableMixin, TracksDataProvider ):
    """
    Interval index files used only for GFF files.
    """

    col_name_data_attr_mapping = { 4: { 'index': 4, 'name': 'Score' } }

    def write_data_to_file( self, chrom, start, end, filename ):
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        out = open( filename, 'w' )
        for start, end, offset in index.find( chrom, start, end ):
            source.seek( offset )
            reader = GFFReaderWrapper( source, fix_strand=True )
            feature = reader.next()
            for interval in feature.intervals:
                out.write( interval.raw_line + '\n' )
        out.close()

    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
        start, end = int( start ), int( end )
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        message = None

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chrom names defined as chrXXX and as XXX.
        chrom = str( chrom )
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        #
        # Build data to return. Payload format is:
        # [ <guid/offset>, <start>, <end>, <name>, <score>, <strand>, <thick_start>,
        #   <thick_end>, <blocks> ]
        #
        # First three entries are mandatory, others are optional.
        #
        filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
        no_detail = ( "no_detail" in kwargs )
        for count, val in enumerate( index.find( chrom, start, end ) ):
            start, end, offset = val[0], val[1], val[2]
            if count < start_val:
                continue
            if count - start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
                break
            source.seek( offset )
            # TODO: can we use column metadata to fill out payload?
            # GFF dataset.
            reader = GFFReaderWrapper( source, fix_strand=True )
            feature = reader.next()
            payload = package_gff_feature( feature, no_detail, filter_cols )
            payload.insert( 0, offset )
            results.append( payload )

        return { 'data': results, 'message': message }

class VcfDataProvider( TabixDataProvider ):
    """
    VCF data provider for the Galaxy track browser.

    Payload format:
    [ uid (offset), start, end, ID, reference base(s), alternate base(s), quality score ]
    """

    col_name_data_attr_mapping = { 'Qual': { 'index': 6, 'name': 'Qual' } }

    def process_data( self, iterator, start_val=0, max_vals=sys.maxint, **kwargs ):
        rval = []
        message = None
        for count, line in enumerate( iterator ):
            if count < start_val:
                continue
            if count - start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
                break
            feature = line.split()
            payload = [ hash( line ), int( feature[1] ) - 1, int( feature[1] ),
                        # ID:
                        feature[2],
                        # Reference base(s):
                        feature[3],
                        # Alternative base(s):
                        feature[4],
                        # Phred quality score:
                        float( feature[5] ) ]
            rval.append( payload )
        return { 'data': rval, 'message': message }
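
# Worked example (illustrative): for the VCF record
#     20  14370  rs6054257  G  A  29  PASS  ...
# process_data() produces, modulo the hash-based uid,
#     [ <hash>, 14369, 14370, 'rs6054257', 'G', 'A', 29.0 ]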

class GFFDataProvider( TracksDataProvider ):
    """
    Provide data from GFF file.

    NOTE: this data provider does not use indices, and hence will be very slow
    for large datasets.
    """

    def get_data( self, chrom, start, end, start_val=0, max_vals=sys.maxint, **kwargs ):
        start, end = int( start ), int( end )
        source = open( self.original_dataset.file_name )
        results = []
        message = None
        offset = 0

        for count, feature in enumerate( GFFReaderWrapper( source, fix_strand=True ) ):
            if count < start_val:
                continue
            if count - start_val >= max_vals:
                message = ERROR_MAX_VALS % ( max_vals, "features" )
                break

            feature_start, feature_end = convert_gff_coords_to_bed( [ feature.start, feature.end ] )
            if feature.chrom != chrom or feature_start < start or feature_end > end:
                continue

            payload = package_gff_feature( feature )
            payload.insert( 0, offset )
            results.append( payload )
            offset += feature.raw_size

        return { 'data': results, 'message': message }

class BedTabixDataProvider( TabixDataProvider, BedDataProvider ):
    """
    Provides data from a BED file indexed via tabix.
    """
    pass

class RawBedDataProvider( BedDataProvider ):
    """
    Provide data from BED file.

    NOTE: this data provider does not use indices, and hence will be very slow
    for large datasets.
    """

    def get_iterator( self, chrom, start, end ):
        start, end = int( start ), int( end )
        def line_filter_iter():
            for line in open( self.original_dataset.file_name ):
                # Skip track/browser header lines and comments.
                if line.startswith( ( "track", "browser", "#" ) ) or not line.strip():
                    continue
                feature = line.split()
                feature_chrom = feature[0]
                # Compare coordinates as integers; raw BED fields are strings.
                feature_start, feature_end = int( feature[1] ), int( feature[2] )
                if feature_chrom != chrom or feature_start > end or feature_end < start:
                    continue
                yield line
        return line_filter_iter()

#
# Helper methods.
#

# Mapping from dataset type name to a class that can fetch data from a file of that
# type. First key is converted dataset type; if result is another dict, second key
# is original dataset type. TODO: This needs to be more flexible.
dataset_type_name_to_data_provider = {
    "tabix": { Vcf: VcfDataProvider, Bed: BedTabixDataProvider, "default": TabixDataProvider },
    "interval_index": IntervalIndexDataProvider,
    "bai": BamDataProvider,
    "summary_tree": SummaryTreeDataProvider,
    "bigwig": BigWigDataProvider,
    "bigbed": BigBedDataProvider
}

def get_data_provider( name=None, original_dataset=None ):
    """
    Returns data provider class by name and/or original dataset.
    """
    data_provider = None
    if name:
        value = dataset_type_name_to_data_provider[ name ]
        if isinstance( value, dict ):
            # Get converter by dataset extension; if there is no data provider,
            # get the default.
            data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
        else:
            data_provider = value
    elif original_dataset:
        # Look up data provider from datatype's information.
        try:
            # Get data provider mapping and data provider for 'data'. If
            # provider available, use it; otherwise use generic provider.
            _, data_provider_mapping = original_dataset.datatype.get_track_type()
            if 'data_standalone' in data_provider_mapping:
                data_provider_name = data_provider_mapping[ 'data_standalone' ]
            else:
                data_provider_name = data_provider_mapping[ 'data' ]
            if data_provider_name:
                data_provider = get_data_provider( name=data_provider_name, original_dataset=original_dataset )
            else:
                data_provider = TracksDataProvider
        except:
            pass
    return data_provider
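
# Usage sketch (illustrative; `hda`, `index_hda`, and `bgzip_hda` are
# hypothetical dataset objects):
#
#     provider_class = get_data_provider( name="tabix", original_dataset=hda )
#     # -> VcfDataProvider, BedTabixDataProvider, or TabixDataProvider,
#     #    depending on hda.datatype
#     provider = provider_class( converted_dataset=index_hda,
#                                original_dataset=hda,
#                                dependencies={ 'bgzip': bgzip_hda } )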

def package_gff_feature( feature, no_detail=False, filter_cols=[] ):
    """ Package a GFF feature in an array for data providers. """
    feature = convert_gff_coords_to_bed( feature )

    # No detail means only start, end.
    if no_detail:
        return [ feature.start, feature.end ]

    # Return full feature.
    payload = [ feature.start,
                feature.end,
                feature.name(),
                feature.strand,
                # No notion of thick start, end in GFF, so make everything
                # thick.
                feature.start,
                feature.end
                ]

    # HACK: remove interval with name 'transcript' from feature.
    # Cufflinks puts this interval in each of its transcripts,
    # and they mess up trackster by covering the feature's blocks.
    # This interval will always be a feature's first interval,
    # and the GFF's third column is its feature name.
    if feature.intervals[0].fields[2] == 'transcript':
        feature.intervals = feature.intervals[1:]

    # Add blocks.
    block_sizes = [ ( interval.end - interval.start ) for interval in feature.intervals ]
    block_starts = [ ( interval.start - feature.start ) for interval in feature.intervals ]
    blocks = zip( block_sizes, block_starts )
    payload.append( [ ( feature.start + block[1], feature.start + block[1] + block[0] ) for block in blocks ] )

    # Add filter data to payload.
    for col in filter_cols:
        if col == "Score":
            payload.append( feature.score )
        elif col in feature.attributes:
            payload.append( feature.attributes[ col ] )
        else:
            # Dummy value.
            payload.append( "na" )

    return payload