PageRenderTime 59ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/interval.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1400 lines | 1296 code | 20 blank | 84 comment | 119 complexity | 73bdc523958f8956b87c680c31b83fec MD5 | raw file
  1. """
  2. Interval datatypes
  3. """
  4. import pkg_resources
  5. pkg_resources.require( "bx-python" )
  6. import logging, os, sys, tempfile
  7. import data
  8. from galaxy import util
  9. from galaxy.datatypes.sniff import *
  10. from galaxy.web import url_for
  11. import urllib
  12. from bx.intervals.io import *
  13. from galaxy.datatypes import metadata
  14. from galaxy.datatypes.metadata import MetadataElement
  15. from galaxy.datatypes.tabular import Tabular
  16. from galaxy.datatypes.util.gff_util import parse_gff_attributes
  17. import math
  18. import dataproviders
  19. log = logging.getLogger(__name__)
  20. # Contains the meta columns and the words that map to it; list aliases on the
  21. # right side of the : in decreasing order of priority
  22. alias_spec = {
  23. 'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name' ],
  24. 'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)' ],
  25. 'endCol' : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)' ],
  26. 'strandCol' : [ 'strand', 'STRAND', 'Strand' ],
  27. 'nameCol' : [ 'name', 'NAME', 'Name', 'name2', 'NAME2', 'Name2', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Peptide ID' ]
  28. }
  29. # a little faster lookup
  30. alias_helper = {}
  31. for key, value in alias_spec.items():
  32. for elem in value:
  33. alias_helper[elem] = key
  34. # Constants for configuring viewport generation: If a line is greater than
  35. # VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes in size,
  36. # then we will not generate a viewport for that dataset
  37. VIEWPORT_READLINE_BUFFER_SIZE = 1048576 # 1MB
  38. VIEWPORT_MAX_READS_PER_LINE = 10
@dataproviders.decorators.has_dataproviders
class Interval( Tabular ):
    """Tab delimited data containing interval information"""
    file_ext = "interval"
    line_class = "region"
    track_type = "FeatureTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }

    """Add metadata elements"""
    # Column positions are 1-based; strand/name use no_value=0 to mean "not set".
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display apps"""
        Tabular.__init__(self, **kwd)
        # Register the UCSC browser as an (old-style) external display application.
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def init_meta( self, dataset, copy_from=None ):
        """Reset this dataset's metadata to defaults (or copy from another dataset)."""
        Tabular.init_meta( self, dataset, copy_from=copy_from )
  59. def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
  60. """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand"""
  61. Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
  62. if dataset.has_data():
  63. empty_line_count = 0
  64. num_check_lines = 100 # only check up to this many non empty lines
  65. for i, line in enumerate( file( dataset.file_name ) ):
  66. line = line.rstrip( '\r\n' )
  67. if line:
  68. if ( first_line_is_header or line[0] == '#' ):
  69. self.init_meta( dataset )
  70. line = line.strip( '#' )
  71. elems = line.split( '\t' )
  72. for meta_name, header_list in alias_spec.iteritems():
  73. for header_val in header_list:
  74. if header_val in elems:
  75. #found highest priority header to meta_name
  76. setattr( dataset.metadata, meta_name, elems.index( header_val ) + 1 )
  77. break #next meta_name
  78. break # Our metadata is set, so break out of the outer loop
  79. else:
  80. # Header lines in Interval files are optional. For example, BED is Interval but has no header.
  81. # We'll make a best guess at the location of the metadata columns.
  82. metadata_is_set = False
  83. elems = line.split( '\t' )
  84. if len( elems ) > 2:
  85. for str in data.col1_startswith:
  86. if line.lower().startswith( str ):
  87. if overwrite or not dataset.metadata.element_is_set( 'chromCol' ):
  88. dataset.metadata.chromCol = 1
  89. try:
  90. int( elems[1] )
  91. if overwrite or not dataset.metadata.element_is_set( 'startCol' ):
  92. dataset.metadata.startCol = 2
  93. except:
  94. pass # Metadata default will be used
  95. try:
  96. int( elems[2] )
  97. if overwrite or not dataset.metadata.element_is_set( 'endCol' ):
  98. dataset.metadata.endCol = 3
  99. except:
  100. pass # Metadata default will be used
  101. #we no longer want to guess that this column is the 'name', name must now be set manually for interval files
  102. #we will still guess at the strand, as we can make a more educated guess
  103. #if len( elems ) > 3:
  104. # try:
  105. # int( elems[3] )
  106. # except:
  107. # if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
  108. # dataset.metadata.nameCol = 4
  109. if len( elems ) < 6 or elems[5] not in data.valid_strand:
  110. if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
  111. dataset.metadata.strandCol = 0
  112. else:
  113. if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
  114. dataset.metadata.strandCol = 6
  115. metadata_is_set = True
  116. break
  117. if metadata_is_set or ( i - empty_line_count ) > num_check_lines:
  118. break # Our metadata is set or we examined 100 non-empty lines, so break out of the outer loop
  119. else:
  120. empty_line_count += 1
  121. def displayable( self, dataset ):
  122. try:
  123. return dataset.has_data() \
  124. and dataset.state == dataset.states.OK \
  125. and dataset.metadata.columns > 0 \
  126. and dataset.metadata.data_lines != 0 \
  127. and dataset.metadata.chromCol \
  128. and dataset.metadata.startCol \
  129. and dataset.metadata.endCol
  130. except:
  131. return False
  132. def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
  133. """Return a chrom, start, stop tuple for viewing a file."""
  134. viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
  135. max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
  136. if not self.displayable( dataset ):
  137. return ( None, None, None )
  138. try:
  139. # If column indexes were not passwed, determine from metadata
  140. if chrom_col is None:
  141. chrom_col = int( dataset.metadata.chromCol ) - 1
  142. if start_col is None:
  143. start_col = int( dataset.metadata.startCol ) - 1
  144. if end_col is None:
  145. end_col = int( dataset.metadata.endCol ) - 1
  146. # Scan lines of file to find a reasonable chromosome and range
  147. chrom = None
  148. start = sys.maxint
  149. end = 0
  150. max_col = max( chrom_col, start_col, end_col )
  151. fh = open( dataset.file_name )
  152. while True:
  153. line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
  154. # Stop if at end of file
  155. if not line:
  156. break
  157. # Skip comment lines
  158. if not line.startswith( '#' ):
  159. try:
  160. fields = line.rstrip().split( '\t' )
  161. if len( fields ) > max_col:
  162. if chrom is None or chrom == fields[ chrom_col ]:
  163. start = min( start, int( fields[ start_col ] ) )
  164. end = max( end, int( fields[ end_col ] ) )
  165. # Set chrom last, in case start and end are not integers
  166. chrom = fields[ chrom_col ]
  167. viewport_feature_count -= 1
  168. except Exception, e:
  169. # Most likely a non-integer field has been encountered
  170. # for start / stop. Just ignore and make sure we finish
  171. # reading the line and decrementing the counters.
  172. pass
  173. # Make sure we are at the next new line
  174. readline_count = VIEWPORT_MAX_READS_PER_LINE
  175. while line.rstrip( '\n\r' ) == line:
  176. assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
  177. line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
  178. if not line: break #EOF
  179. readline_count -= 1
  180. max_line_count -= 1
  181. if not viewport_feature_count or not max_line_count:
  182. #exceeded viewport or total line count to check
  183. break
  184. if chrom is not None:
  185. return ( chrom, str( start ), str( end ) ) # Necessary to return strings?
  186. except Exception, e:
  187. # Unexpected error, possibly missing metadata
  188. log.exception( "Exception caught attempting to generate viewport for dataset '%d'", dataset.id )
  189. return ( None, None, None )
  190. def as_ucsc_display_file( self, dataset, **kwd ):
  191. """Returns file contents with only the bed data"""
  192. fd, temp_name = tempfile.mkstemp()
  193. c, s, e, t, n = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol or 0, dataset.metadata.nameCol or 0
  194. c, s, e, t, n = int(c)-1, int(s)-1, int(e)-1, int(t)-1, int(n)-1
  195. if t >= 0: # strand column (should) exists
  196. for i, elems in enumerate( util.file_iter(dataset.file_name) ):
  197. strand = "+"
  198. name = "region_%i" % i
  199. if n >= 0 and n < len( elems ): name = elems[n]
  200. if t<len(elems): strand = elems[t]
  201. tmp = [ elems[c], elems[s], elems[e], name, '0', strand ]
  202. os.write(fd, '%s\n' % '\t'.join(tmp) )
  203. elif n >= 0: # name column (should) exists
  204. for i, elems in enumerate( util.file_iter(dataset.file_name) ):
  205. name = "region_%i" % i
  206. if n >= 0 and n < len( elems ): name = elems[n]
  207. tmp = [ elems[c], elems[s], elems[e], name ]
  208. os.write(fd, '%s\n' % '\t'.join(tmp) )
  209. else:
  210. for elems in util.file_iter(dataset.file_name):
  211. tmp = [ elems[c], elems[s], elems[e] ]
  212. os.write(fd, '%s\n' % '\t'.join(tmp) )
  213. os.close(fd)
  214. return open(temp_name)
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        # Label the metadata-designated columns in the peek table header.
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'endCol':'End', 'strandCol':'Strand', 'nameCol':'Name'} )
  218. def ucsc_links( self, dataset, type, app, base_url ):
  219. """
  220. Generate links to UCSC genome browser sites based on the dbkey
  221. and content of dataset.
  222. """
  223. # Filter UCSC sites to only those that are supported by this build and
  224. # enabled.
  225. valid_sites = [ ( name, url )
  226. for name, url in util.get_ucsc_by_build( dataset.dbkey )
  227. if name in app.config.ucsc_display_sites ]
  228. if not valid_sites:
  229. return []
  230. # If there are any valid sites, we need to generate the estimated
  231. # viewport
  232. chrom, start, stop = self.get_estimated_display_viewport( dataset )
  233. if chrom is None:
  234. return []
  235. # Accumulate links for valid sites
  236. ret_val = []
  237. for site_name, site_url in valid_sites:
  238. internal_url = url_for( controller='dataset', dataset_id=dataset.id,
  239. action='display_at', filename='ucsc_' + site_name )
  240. display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at"
  241. % (base_url, url_for( controller='root' ), dataset.id, type) )
  242. redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s"
  243. % (site_url, dataset.dbkey, chrom, start, stop ) )
  244. link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
  245. ret_val.append( ( site_name, link ) )
  246. return ret_val
  247. def validate( self, dataset ):
  248. """Validate an interval file using the bx GenomicIntervalReader"""
  249. errors = list()
  250. c, s, e, t = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol
  251. c, s, e, t = int(c)-1, int(s)-1, int(e)-1, int(t)-1
  252. infile = open(dataset.file_name, "r")
  253. reader = GenomicIntervalReader(
  254. infile,
  255. chrom_col = c,
  256. start_col = s,
  257. end_col = e,
  258. strand_col = t)
  259. while True:
  260. try:
  261. reader.next()
  262. except ParseError, e:
  263. errors.append(e)
  264. except StopIteration:
  265. infile.close()
  266. return errors
  267. def repair_methods( self, dataset ):
  268. """Return options for removing errors along with a description"""
  269. return [("lines","Remove erroneous lines")]
  270. def sniff( self, filename ):
  271. """
  272. Checks for 'intervalness'
  273. This format is mostly used by galaxy itself. Valid interval files should include
  274. a valid header comment, but this seems to be loosely regulated.
  275. >>> fname = get_test_fname( 'test_space.txt' )
  276. >>> Interval().sniff( fname )
  277. False
  278. >>> fname = get_test_fname( 'interval.interval' )
  279. >>> Interval().sniff( fname )
  280. True
  281. """
  282. headers = get_headers( filename, '\t' )
  283. try:
  284. """
  285. If we got here, we already know the file is_column_based and is not bed,
  286. so we'll just look for some valid data.
  287. """
  288. for hdr in headers:
  289. if hdr and not hdr[0].startswith( '#' ):
  290. if len(hdr) < 3:
  291. return False
  292. try:
  293. # Assume chrom start and end are in column positions 1 and 2
  294. # respectively ( for 0 based columns )
  295. check = int( hdr[1] )
  296. check = int( hdr[2] )
  297. except:
  298. return False
  299. return True
  300. except:
  301. return False
  302. def get_track_window(self, dataset, data, start, end):
  303. """
  304. Assumes the incoming track data is sorted already.
  305. """
  306. window = list()
  307. for record in data:
  308. fields = record.rstrip("\n\r").split("\t")
  309. record_chrom = fields[dataset.metadata.chromCol-1]
  310. record_start = int(fields[dataset.metadata.startCol-1])
  311. record_end = int(fields[dataset.metadata.endCol-1])
  312. if record_start < end and record_end > start:
  313. window.append( (record_chrom, record_start, record_end) ) #Yes I did want to use a generator here, but it doesn't work downstream
  314. return window
    def get_track_resolution( self, dataset, start, end):
        """Return None; no track resolution is computed for interval data."""
        return None
    # ------------- Dataproviders
    # Each factory is registered under its string name by the class-level
    # @has_dataproviders decorator.
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        """Provide data via a GenomicRegionDataProvider over this dataset."""
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        """Same provider, but with named_columns enabled before delegating."""
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dataprovider( self, dataset, **settings ):
        """Provide data via an IntervalDataProvider over this dataset."""
        return dataproviders.dataset.IntervalDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dict_dataprovider( self, dataset, **settings ):
        """Same provider, but with named_columns enabled before delegating."""
        settings[ 'named_columns' ] = True
        return self.interval_dataprovider( dataset, **settings )
class BedGraph( Interval ):
    """Tab delimited chrom/start/end/datavalue dataset"""
    file_ext = "bedgraph"
    track_type = "LineTrack"
    data_sources = { "data": "bigwig", "index": "bigwig" }

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
        Returns file contents as is with no modifications.
        TODO: this is a functional stub and will need to be enhanced moving forward to provide additional support for bedgraph.
        """
        return open( dataset.file_name )

    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
        """
        Set viewport based on dataset's first 100 lines.
        """
        # BedGraph columns are fixed, so pass the 0-based defaults straight
        # through to the generic Interval implementation.
        return Interval.get_estimated_display_viewport( self, dataset, chrom_col = chrom_col, start_col = start_col, end_col = end_col )
class Bed( Interval ):
    """Tab delimited data in BED format"""
    file_ext = "bed"
    data_sources = { "data": "tabix", "index": "bigwig", "feature_search": "fli" }
    track_type = Interval.track_type

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", desc="Score column for visualization", default=[4], param=metadata.ColumnParameter, optional=True, multiple=True )
    ### do we need to repeat these? they are the same as should be inherited from interval type
  365. def set_meta( self, dataset, overwrite = True, **kwd ):
  366. """Sets the metadata information for datasets previously determined to be in bed format."""
  367. i = 0
  368. if dataset.has_data():
  369. for i, line in enumerate( file(dataset.file_name) ):
  370. metadata_set = False
  371. line = line.rstrip('\r\n')
  372. if line and not line.startswith('#'):
  373. elems = line.split('\t')
  374. if len(elems) > 2:
  375. for startswith in data.col1_startswith:
  376. if line.lower().startswith( startswith ):
  377. if len( elems ) > 3:
  378. if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
  379. dataset.metadata.nameCol = 4
  380. if len(elems) < 6:
  381. if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
  382. dataset.metadata.strandCol = 0
  383. else:
  384. if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
  385. dataset.metadata.strandCol = 6
  386. metadata_set = True
  387. break
  388. if metadata_set: break
  389. Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
  390. def as_ucsc_display_file( self, dataset, **kwd ):
  391. """Returns file contents with only the bed data. If bed 6+, treat as interval."""
  392. for line in open(dataset.file_name):
  393. line = line.strip()
  394. if line == "" or line.startswith("#"):
  395. continue
  396. fields = line.split('\t')
  397. """check to see if this file doesn't conform to strict genome browser accepted bed"""
  398. try:
  399. if len(fields) > 12:
  400. return Interval.as_ucsc_display_file(self, dataset) #too many fields
  401. if len(fields) > 6:
  402. int(fields[6])
  403. if len(fields) > 7:
  404. int(fields[7])
  405. if len(fields) > 8:
  406. if int(fields[8]) != 0:
  407. return Interval.as_ucsc_display_file(self, dataset)
  408. if len(fields) > 9:
  409. int(fields[9])
  410. if len(fields) > 10:
  411. fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
  412. for field in fields2:
  413. int(field)
  414. if len(fields) > 11:
  415. fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
  416. for field in fields2:
  417. int(field)
  418. except: return Interval.as_ucsc_display_file(self, dataset)
  419. #only check first line for proper form
  420. break
  421. try: return open(dataset.file_name)
  422. except: return "This item contains no content"
  423. def sniff( self, filename ):
  424. """
  425. Checks for 'bedness'
  426. BED lines have three required fields and nine additional optional fields.
  427. The number of fields per line must be consistent throughout any single set of data in
  428. an annotation track. The order of the optional fields is binding: lower-numbered
  429. fields must always be populated if higher-numbered fields are used. The data type of
  430. all 12 columns is:
  431. 1-str, 2-int, 3-int, 4-str, 5-int, 6-str, 7-int, 8-int, 9-int or list, 10-int, 11-list, 12-list
  432. For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1
  433. >>> fname = get_test_fname( 'test_tab.bed' )
  434. >>> Bed().sniff( fname )
  435. True
  436. >>> fname = get_test_fname( 'interval1.bed' )
  437. >>> Bed().sniff( fname )
  438. True
  439. >>> fname = get_test_fname( 'complete.bed' )
  440. >>> Bed().sniff( fname )
  441. True
  442. """
  443. headers = get_headers( filename, '\t' )
  444. try:
  445. if not headers: return False
  446. for hdr in headers:
  447. if (hdr[0] == '' or hdr[0].startswith( '#' )):
  448. continue
  449. valid_col1 = False
  450. if len(hdr) < 3 or len(hdr) > 12:
  451. return False
  452. for str in data.col1_startswith:
  453. if hdr[0].lower().startswith(str):
  454. valid_col1 = True
  455. break
  456. if valid_col1:
  457. try:
  458. int( hdr[1] )
  459. int( hdr[2] )
  460. except:
  461. return False
  462. if len( hdr ) > 4:
  463. #hdr[3] is a string, 'name', which defines the name of the BED line - difficult to test for this.
  464. #hdr[4] is an int, 'score', a score between 0 and 1000.
  465. try:
  466. if int( hdr[4] ) < 0 or int( hdr[4] ) > 1000: return False
  467. except:
  468. return False
  469. if len( hdr ) > 5:
  470. #hdr[5] is strand
  471. if hdr[5] not in data.valid_strand: return False
  472. if len( hdr ) > 6:
  473. #hdr[6] is thickStart, the starting position at which the feature is drawn thickly.
  474. try: int( hdr[6] )
  475. except: return False
  476. if len( hdr ) > 7:
  477. #hdr[7] is thickEnd, the ending position at which the feature is drawn thickly
  478. try: int( hdr[7] )
  479. except: return False
  480. if len( hdr ) > 8:
  481. #hdr[8] is itemRgb, an RGB value of the form R,G,B (e.g. 255,0,0). However, this could also be an int (e.g., 0)
  482. try: int( hdr[8] )
  483. except:
  484. try: hdr[8].split(',')
  485. except: return False
  486. if len( hdr ) > 9:
  487. #hdr[9] is blockCount, the number of blocks (exons) in the BED line.
  488. try: block_count = int( hdr[9] )
  489. except: return False
  490. if len( hdr ) > 10:
  491. #hdr[10] is blockSizes - A comma-separated list of the block sizes.
  492. #Sometimes the blosck_sizes and block_starts lists end in extra commas
  493. try: block_sizes = hdr[10].rstrip(',').split(',')
  494. except: return False
  495. if len( hdr ) > 11:
  496. #hdr[11] is blockStarts - A comma-separated list of block starts.
  497. try: block_starts = hdr[11].rstrip(',').split(',')
  498. except: return False
  499. if len(block_sizes) != block_count or len(block_starts) != block_count: return False
  500. else: return False
  501. return True
  502. except: return False
class BedStrict( Bed ):
    """Tab delimited data in strict BED format - no non-standard columns allowed"""
    file_ext = "bedstrict"
    # no user change of datatype allowed
    allow_datatype_change = False
    # Read only metadata elements: column positions are fixed in strict BED.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", readonly=True, param=metadata.MetadataParameter ) #TODO: start and end should be able to be set to these or the proper thick[start/end]?
    MetadataElement( name="endCol", default=3, desc="End column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        # Calls Tabular.__init__ directly (not Bed/Interval, whose __init__
        # registers the old-style UCSC display app).
        Tabular.__init__( self, **kwd )
        self.clear_display_apps() #only new style display applications for this datatype

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Derive name/strand columns from the column count (positions are fixed)."""
        Tabular.set_meta( self, dataset, overwrite = overwrite, **kwd) #need column count first
        if dataset.metadata.columns >= 4:
            dataset.metadata.nameCol = 4
        if dataset.metadata.columns >= 6:
            dataset.metadata.strandCol = 6

    def sniff( self, filename ):
        return False #NOTE: This would require aggressively validating the entire file
class Bed6( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 6"""
    file_ext = "bed6"
class Bed12( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 12"""
    file_ext = "bed12"
  532. class _RemoteCallMixin:
  533. def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
  534. """Retrieve the URL to call out to an external site and retrieve data.
  535. This routes our external URL through a local galaxy instance which makes
  536. the data available, followed by redirecting to the remote site with a
  537. link back to the available information.
  538. """
  539. internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='%s_%s' % ( type, site_name ) )
  540. base_url = app.config.get( "display_at_callback", base_url )
  541. display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % \
  542. ( base_url, url_for( controller='root' ), dataset.id, type ) )
  543. link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
  544. return link
@dataproviders.decorators.has_dataproviders
class Gff( Tabular, _RemoteCallMixin ):
    """Tab delimited data in Gff format"""
    file_ext = "gff"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
    data_sources = { "data": "interval_index", "index": "bigwig", "feature_search": "fli" }
    track_type = Interval.track_type

    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    # attribute metadata is populated by set_attribute_metadata() from column 9
    MetadataElement( name="attributes", default=0, desc="Number of attributes", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="attribute_types", default={}, desc="Attribute types", param=metadata.DictParameter, readonly=True, visible=False, no_value=[] )
    def __init__( self, **kwd ):
        """Initialize datatype, by adding GBrowse display app"""
        Tabular.__init__(self, **kwd)
        # Register both UCSC and GBrowse as external display applications.
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
  562. def set_attribute_metadata( self, dataset ):
  563. """
  564. Sets metadata elements for dataset's attributes.
  565. """
  566. # Use first N lines to set metadata for dataset attributes. Attributes
  567. # not found in the first N lines will not have metadata.
  568. num_lines = 200
  569. attribute_types = {}
  570. for i, line in enumerate( file ( dataset.file_name ) ):
  571. if line and not line.startswith( '#' ):
  572. elems = line.split( '\t' )
  573. if len( elems ) == 9:
  574. try:
  575. # Loop through attributes to set types.
  576. for name, value in parse_gff_attributes( elems[8] ).items():
  577. # Default type is string.
  578. value_type = "str"
  579. try:
  580. # Try int.
  581. int( value )
  582. value_type = "int"
  583. except:
  584. try:
  585. # Try float.
  586. float( value )
  587. value_type = "float"
  588. except:
  589. pass
  590. attribute_types[ name ] = value_type
  591. except:
  592. pass
  593. if i + 1 == num_lines:
  594. break
  595. # Set attribute metadata and then set additional metadata.
  596. dataset.metadata.attribute_types = attribute_types
  597. dataset.metadata.attributes = len( attribute_types )
  598. def set_meta( self, dataset, overwrite = True, **kwd ):
  599. self.set_attribute_metadata( dataset )
  600. i = 0
  601. for i, line in enumerate( file ( dataset.file_name ) ):
  602. line = line.rstrip('\r\n')
  603. if line and not line.startswith( '#' ):
  604. elems = line.split( '\t' )
  605. if len(elems) == 9:
  606. try:
  607. int( elems[3] )
  608. int( elems[4] )
  609. break
  610. except:
  611. pass
  612. Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        # Use the fixed GFF column names as the peek table headers.
        return Tabular.make_html_table( self, dataset, column_names=self.column_names )
    def get_estimated_display_viewport( self, dataset ):
        """
        Return a chrom, start, stop tuple for viewing a file.  There are slight differences between gff 2 and gff 3
        formats.  This function should correctly handle both...

        Prefers an explicit ##sequence-region or 'browser position' declaration;
        otherwise accumulates a range from the first feature lines.
        """
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                seqid = None
                start = sys.maxint
                stop = 0
                fh = open( dataset.file_name )
                while True:
                    # Lines are read in bounded chunks to cap work on huge lines.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
                            elems = line.rstrip( '\n\r' ).split()
                            if len( elems ) > 3:
                                # line looks like:
                                # ##sequence-region   ctg123   1 1497228
                                seqid = elems[1] # IV
                                start = int( elems[2] )# 6000000
                                stop = int( elems[3] ) # 6030000
                                break #use location declared in file
                            elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
                                # line looks like this:
                                # ##sequence-region X:120000..140000
                                elems = elems[1].split( ':' )
                                seqid = elems[0]
                                start = int( elems[1].split( '..' )[0] )
                                stop = int( elems[1].split( '..' )[1] )
                                break #use location declared in file
                            else:
                                log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
                                #break #no break, if bad definition, we try another method
                        elif line.startswith("browser position"):
                            # Allow UCSC style browser and track info in the GFF file
                            pos_info = line.split()[-1]
                            seqid, startend = pos_info.split(":")
                            start, stop = map( int, startend.split("-") )
                            break #use location declared in file
                        elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4
                            viewport_feature_count -= 1
                            elems = line.rstrip( '\n\r' ).split( '\t' )
                            if len( elems ) > 3:
                                if not seqid:
                                    # We can only set the viewport for a single chromosome
                                    seqid = elems[0]
                                if seqid == elems[0]:
                                    # Make sure we have not spanned chromosomes
                                    start = min( start, int( elems[3] ) )
                                    stop = max( stop, int( elems[4] ) )
                    except:
                        #most likely start/stop is not an int or not enough fields
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if seqid is not None:
                    return ( seqid, str( start ), str( stop ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
  690. def ucsc_links( self, dataset, type, app, base_url ):
  691. ret_val = []
  692. seqid, start, stop = self.get_estimated_display_viewport( dataset )
  693. if seqid is not None:
  694. for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
  695. if site_name in app.config.ucsc_display_sites:
  696. redirect_url = urllib.quote_plus(
  697. "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" %
  698. ( site_url, dataset.dbkey, seqid, start, stop ) )
  699. link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
  700. ret_val.append( ( site_name, link ) )
  701. return ret_val
  702. def gbrowse_links( self, dataset, type, app, base_url ):
  703. ret_val = []
  704. seqid, start, stop = self.get_estimated_display_viewport( dataset )
  705. if seqid is not None:
  706. for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
  707. if site_name in app.config.gbrowse_display_sites:
  708. if seqid.startswith( 'chr' ) and len ( seqid ) > 3:
  709. seqid = seqid[3:]
  710. redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, seqid, start, stop ) )
  711. link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
  712. ret_val.append( ( site_name, link ) )
  713. return ret_val
  714. def sniff( self, filename ):
  715. """
  716. Determines whether the file is in gff format
  717. GFF lines have nine required fields that must be tab-separated.
  718. For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3
  719. >>> fname = get_test_fname( 'gff_version_3.gff' )
  720. >>> Gff().sniff( fname )
  721. False
  722. >>> fname = get_test_fname( 'test.gff' )
  723. >>> Gff().sniff( fname )
  724. True
  725. """
  726. headers = get_headers( filename, '\t' )
  727. try:
  728. if len(headers) < 2:
  729. return False
  730. for hdr in headers:
  731. if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
  732. return False
  733. if hdr and hdr[0] and not hdr[0].startswith( '#' ):
  734. if len(hdr) != 9:
  735. return False
  736. try:
  737. int( hdr[3] )
  738. int( hdr[4] )
  739. except:
  740. return False
  741. if hdr[5] != '.':
  742. try:
  743. score = float( hdr[5] )
  744. except:
  745. return False
  746. if hdr[6] not in data.valid_strand:
  747. return False
  748. return True
  749. except:
  750. return False
    # ------------- Dataproviders
    # redefine bc super is Tabular
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        """Provide genomic-region data using fixed GFF columns: 0 (seqid), 3 (start), 4 (end)."""
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, 0, 3, 4, **settings )
    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        """Same as genomic_region_dataprovider but emits named-column (dict) rows."""
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )
    @dataproviders.decorators.dataprovider_factory( 'interval',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dataprovider( self, dataset, **settings ):
        """Provide interval data from GFF columns 0, 3, 4, 6, 2.

        Column positions passed to the provider are: seqid (0), start (3),
        end (4), strand (6) and column 2 — presumably used as the feature
        name; confirm against IntervalDataProvider's parameter order.
        """
        return dataproviders.dataset.IntervalDataProvider( dataset, 0, 3, 4, 6, 2, **settings )
    @dataproviders.decorators.dataprovider_factory( 'interval-dict',
                                                    dataproviders.dataset.IntervalDataProvider.settings )
    def interval_dict_dataprovider( self, dataset, **settings ):
        """Same as interval_dataprovider but emits named-column (dict) rows."""
        settings[ 'named_columns' ] = True
        return self.interval_dataprovider( dataset, **settings )
class Gff3( Gff ):
    """Tab delimited data in Gff3 format"""
    file_ext = "gff3"
    # Legal values for the strand (col 7) and phase (col 8) columns per the GFF3 spec.
    valid_gff3_strand = ['+', '-', '.', '?']
    valid_gff3_phase = ['.', '0', '1', '2']
    column_names = [ 'Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes' ]
    track_type = Interval.track_type
    """Add metadata elements"""
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    def __init__(self, **kwd):
        """Initialize datatype, by adding GBrowse display app"""
        Gff.__init__(self, **kwd)
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set metadata, skipping header/comment lines up to the first valid-looking GFF3 feature row."""
        self.set_attribute_metadata( dataset )
        # i ends up as the index of the first structurally valid data line; it is
        # passed to Tabular.set_meta as the number of lines to skip.
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                valid_start = False
                valid_end = False
                if len( elems ) == 9:
                    try:
                        start = int( elems[3] )
                        valid_start = True
                    except:
                        # '.' is accepted as an undefined start per the GFF spec
                        if elems[3] == '.':
                            valid_start = True
                    try:
                        end = int( elems[4] )
                        valid_end = True
                    except:
                        # '.' is accepted as an undefined end per the GFF spec
                        if elems[4] == '.':
                            valid_end = True
                    strand = elems[6]
                    phase = elems[7]
                    # NOTE(review): if elems[3] or elems[4] is '.', valid_start/valid_end is
                    # True but start/end may be unbound here, so `start < end` can raise an
                    # uncaught NameError — confirm whether '.' coordinates occur in practice.
                    if valid_start and valid_end and start < end and strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
                        break
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
    def sniff( self, filename ):
        """
        Determines whether the file is in gff version 3 format
        GFF 3 format:
        1) adds a mechanism for representing more than one level
        of hierarchical grouping of features and subfeatures.
        2) separates the ideas of group membership and feature name/id
        3) constrains the feature type field to be taken from a controlled
        vocabulary.
        4) allows a single feature, such as an exon, to belong to more than
        one group at a time.
        5) provides an explicit convention for pairwise alignments
        6) provides an explicit convention for features that occupy disjunct regions
        The format consists of 9 columns, separated by tabs (NOT spaces).
        Undefined fields are replaced with the "." character, as described in the original GFF spec.
        For complete details see http://song.sourceforge.net/gff3.shtml
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff3().sniff( fname )
        False
        >>> fname = get_test_fname('gff_version_3.gff')
        >>> Gff3().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            # Require at least two lines so a single stray row is not mistaken for GFF3.
            if len(headers) < 2:
                return False
            for hdr in headers:
                # An explicit version pragma decides immediately, either way.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) >= 0:
                    return True
                elif hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0:
                    return False
                # Header comments may have been stripped, so inspect the data
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        int( hdr[3] )
                    except:
                        # '.' marks an undefined start; anything else non-integer disqualifies
                        if hdr[3] != '.':
                            return False
                    try:
                        int( hdr[4] )
                    except:
                        # '.' marks an undefined end
                        if hdr[4] != '.':
                            return False
                    if hdr[5] != '.':
                        try:
                            # score must be numeric when not the '.' placeholder
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in self.valid_gff3_strand:
                        return False
                    if hdr[7] not in self.valid_gff3_phase:
                        return False
            return True
        except:
            # malformed/short rows simply mean "not GFF3"
            return False
class Gtf( Gff ):
    """Tab delimited data in Gtf format"""
    file_ext = "gtf"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes' ]
    track_type = Interval.track_type
    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    def sniff( self, filename ):
        """
        Determines whether the file is in gtf format

        GTF lines have nine required fields that must be tab-separated. The first eight GTF fields are the same as GFF.
        The group field has been expanded into a list of attributes. Each attribute consists of a type/value pair.
        Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space.
        The attribute list must begin with the two mandatory attributes:

            gene_id value - A globally unique identifier for the genomic source of the sequence.
            transcript_id value - A globally unique identifier for the predicted transcript.

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format4

        >>> fname = get_test_fname( '1.bed' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gtf' )
        >>> Gtf().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            # Require at least two lines so a single stray row is not mistaken for GTF.
            if len(headers) < 2:
                return False
            for hdr in headers:
                # GTF is a refinement of GFF version 2; any other declared version disqualifies.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
                    return False
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        # start and end columns must be integers
                        int( hdr[3] )
                        int( hdr[4] )
                    except:
                        return False
                    if hdr[5] != '.':
                        try:
                            # score must be numeric when not the '.' placeholder
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in data.valid_strand:
                        return False
                    # Check attributes for gene_id, transcript_id
                    attributes = parse_gff_attributes( hdr[8] )
                    if len( attributes ) >= 2:
                        if 'gene_id' not in attributes:
                            return False
                        if 'transcript_id' not in attributes:
                            return False
                    else:
                        return False
            return True
        except:
            # malformed/short rows simply mean "not GTF"
            return False
@dataproviders.decorators.has_dataproviders
class Wiggle( Tabular, _RemoteCallMixin ):
    """Tab delimited data in wiggle format"""
    file_ext = "wig"
    track_type = "LineTrack"
    data_sources = { "data": "bigwig", "index": "bigwig" }
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    def __init__( self, **kwd ):
        """Initialize the datatype and register the UCSC and GBrowse display applications."""
        Tabular.__init__( self, **kwd )
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
    def get_estimated_display_viewport( self, dataset ):
        """Return a chrom, start, stop tuple for viewing a file."""
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                chrom = None
                start = sys.maxint
                end = 0
                span = 1    # default wiggle span when no span= attribute is present
                step = None # remains None for variableStep / bed-style data
                fh = open( dataset.file_name )
                while True:
                    # Bounded read so one pathologically long line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( "browser" ):
                            # A UCSC browser line declares the viewport explicitly, e.g.
                            # "browser position chr19:59302001-59310300"
                            chr_info = line.rstrip( '\n\r' ).split()[-1]
                            chrom, coords = chr_info.split( ":" )
                            start, end = map( int, coords.split( "-" ) )
                            break # use the browser line
                        # variableStep chrom=chr20
                        if line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                            if chrom is not None: break #different chrom or different section of the chrom
                            chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                            if 'span=' in line:
                                span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                            if 'step=' in line:
                                # fixedStep declarations carry both step= and start=
                                step = int( line.rstrip( '\n\r' ).split("step=")[1].split()[0] )
                                start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                        else:
                            # A data line: fixedStep advances by step; otherwise the first
                            # field is the position (variableStep) used to widen the viewport.
                            fields = line.rstrip( '\n\r' ).split()
                            if fields:
                                if step is not None:
                                    if not end:
                                        end = start + span
                                    else:
                                        end += step
                                else:
                                    start = min( int( fields[0] ), start )
                                    end = max( end, int( fields[0] ) + span )
                                viewport_feature_count -= 1
                    except:
                        # unparsable line — skip it and keep scanning
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if chrom is not None:
                    return ( chrom, str( start ), str( end ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def gbrowse_links( self, dataset, type, app, base_url ):
        """Return ( site_name, link ) pairs for displaying this dataset at configured GBrowse sites."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is not None:
            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
                if site_name in app.config.gbrowse_display_sites:
                    # GBrowse expects chromosome names without the UCSC-style 'chr' prefix.
                    if chrom.startswith( 'chr' ) and len ( chrom ) > 3:
                        chrom = chrom[3:]
                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, chrom, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def ucsc_links( self, dataset, type, app, base_url ):
        """Return ( site_name, link ) pairs for displaying this dataset at configured UCSC sites."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
                if site_name in app.config.ucsc_display_sites:
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % ( site_url, dataset.dbkey, chrom, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping leading track/declaration lines before the first data row."""
        max_data_lines = None
        # i ends up as the index of the first line that looks like wiggle data
        # (a leading float) or a known track line; passed to Tabular.set_meta as skip.
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                try:
                    float( elems[0] ) #"Wiggle track data values can be integer or real, positive or negative values"
                    break
                except:
                    do_break = False
                    for col_startswith in data.col1_startswith:
                        if elems[0].lower().startswith( col_startswith ):
                            do_break = True
                            break
                    if do_break:
                        break
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            #we'll arbitrarily only use the first 100 data lines in this wig file to calculate tabular attributes (column types)
            #this should be sufficient, except when we have mixed wig track types (bed, variable, fixed),
            # but those cases are not a single table that would have consistant column definitions
            #optional metadata values set in Tabular class will be 'None'
            max_data_lines = 100
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i, max_data_lines = max_data_lines )
    def sniff( self, filename ):
        """
        Determines wether the file is in wiggle format

        The .wig format is line-oriented. Wiggle data is preceeded by a track definition line,
        which adds a number of options for controlling the default display of this track.
        Following the track definition line is the track data, which can be entered in several
        different formats.

        The track definition line begins with the word 'track' followed by the track type.
        The track type with version is REQUIRED, and it currently must be wiggle_0. For example,
        track type=wiggle_0...

        For complete details see http://genome.ucsc.edu/goldenPath/help/wiggle.html

        >>> fname = get_test_fname( 'interval1.bed' )
        >>> Wiggle().sniff( fname )
        False
        >>> fname = get_test_fname( 'wiggle.wig' )
        >>> Wiggle().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        try:
            for hdr in headers:
                # only the required "track type=wiggle_0" declaration identifies wiggle
                if len(hdr) > 1 and hdr[0] == 'track' and hdr[1].startswith('type=wiggle'):
                    return True
            return False
        except:
            return False
    def get_track_window(self, dataset, data, start, end):
        """
        Assumes we have a numpy file.
        """
        # Maybe if we import here people will still be able to use Galaxy when numpy kills it
        pkg_resources.require("numpy>=1.2.1")
        #from numpy.lib import format
        import numpy
        # NOTE(review): `range` shadows the builtin; also, with Python 2 integer
        # division, range / 1000 is 0 for spans under 1000, making log10 raise — confirm
        # callers always pass spans >= 1000.
        range = end - start
        # Determine appropriate resolution to plot ~1000 points
        resolution = ( 10 ** math.ceil( math.log10( range / 1000 ) ) )
        # Restrict to valid range
        resolution = min( resolution, 100000 )
        resolution = max( resolution, 1 )
        # Memory map the array (don't load all the data)
        data = numpy.load( data )
        # Grab just what we need
        t_start = math.floor( start / resolution )
        t_end = math.ceil( end / resolution )
        x = numpy.arange( t_start, t_end ) * resolution
        y = data[ t_start : t_end ]
        return zip(x.tolist(), y.tolist())
    def get_track_resolution( self, dataset, start, end):
        """Return the plotting resolution (clamped to [1, 100000]) targeting ~1000 points over [start, end)."""
        # NOTE(review): same builtin-shadowing / small-span log10 caveat as get_track_window.
        range = end - start
        # Determine appropriate resolution to plot ~1000 points
        resolution = math.ceil( 10 ** math.ceil( math.log10( range / 1000 ) ) )
        # Restrict to valid range
        resolution = min( resolution, 100000 )
        resolution = max( resolution, 1 )
        return resolution
    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'wiggle', dataproviders.dataset.WiggleDataProvider.settings )
    def wiggle_dataprovider( self, dataset, **settings ):
        """Provide parsed wiggle data rows from the raw dataset."""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        return dataproviders.dataset.WiggleDataProvider( dataset_source, **settings )
    @dataproviders.decorators.dataprovider_factory( 'wiggle-dict', dataproviders.dataset.WiggleDataProvider.settings )
    def wiggle_dict_dataprovider( self, dataset, **settings ):
        """Same as wiggle_dataprovider but emits named-column (dict) rows."""
        dataset_source = dataproviders.dataset.DatasetDataProvider( dataset )
        settings[ 'named_columns' ] = True
        return dataproviders.dataset.WiggleDataProvider( dataset_source, **settings )
class CustomTrack ( Tabular ):
    """UCSC CustomTrack"""
    file_ext = "customtrack"
    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display app"""
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping the leading 'track ...' definition line."""
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file."""
        #FIXME: only BED and WIG custom tracks are currently supported
        #As per previously existing behavior, viewport will only be over the first intervals
        max_line_count = 100 # maximum number of lines to check; includes comment lines
        variable_step_wig = False
        chrom = None
        span = 1 # default wiggle span when no span= attribute is present
        if self.displayable( dataset ):
            try:
                fh = open( dataset.file_name )
                while True:
                    # Bounded read so one pathologically long line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    if not line.startswith( '#' ):
                        try:
                            if variable_step_wig:
                                # A previous variableStep declaration without start=;
                                # the first 2-field data line supplies the position.
                                fields = line.rstrip().split()
                                if len( fields ) == 2:
                                    start = int( fields[ 0 ] )
                                    return ( chrom, str( start ), str( start + span ) )
                            elif line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                                # Wiggle declaration line: parse chrom/span, and start when present.
                                chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                                if 'span=' in line:
                                    span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                                if 'start=' in line:
                                    start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                                    return ( chrom, str( start ), str( start + span ) )
                                else:
                                    variable_step_wig = True
                            else:
                                # Assume BED-style: chrom, start, end in the first three columns.
                                fields = line.rstrip().split( '\t' )
                                if len( fields ) >= 3:
                                    chrom = fields[ 0 ]
                                    start = int( fields[ 1 ] )
                                    end = int( fields[ 2 ] )
                                    return ( chrom, str( start ), str( end ) )
                        except Exception:
                            #most likely a non-integer field has been encountered for start / stop
                            continue
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not max_line_count:
                        #exceeded viewport or total line count to check
                        break
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def ucsc_links( self, dataset, type, app, base_url ):
        """Return ( site_name, link ) pairs for displaying this dataset at configured UCSC sites."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport(dataset)
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type) )
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop ) )
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val
    def sniff( self, filename ):
        """
        Determines whether the file is in customtrack format.

        CustomTrack files are built within Galaxy and are basically bed or interval files with the first line looking
        something like this.

        track name="User Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1

        >>> fname = get_test_fname( 'complete.bed' )
        >>> CustomTrack().sniff( fname )
        False
        >>> fname = get_test_fname( 'ucsc.customtrack' )
        >>> CustomTrack().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        first_line = True
        for hdr in headers:
            if first_line:
                first_line = False
                try:
                    # The first line must be a track definition carrying both
                    # color= and visibility= attributes.
                    if hdr[0].startswith('track'):
                        color_found = False
                        visibility_found = False
                        for elem in hdr[1:]:
                            if elem.startswith('color'): color_found = True
                            if elem.startswith('visibility'): visibility_found = True
                            if color_found and visibility_found: break
                        if not color_found or not visibility_found: return False
                    else: return False
                except: return False
            else:
                try:
                    # Subsequent data lines must look interval-like: at least
                    # three columns with integer start/end.
                    if hdr[0] and not hdr[0].startswith( '#' ):
                        if len( hdr ) < 3:
                            return False
                        try:
                            int( hdr[1] )
                            int( hdr[2] )
                        except:
                            return False
                except:
                    return False
        return True
class ENCODEPeak( Interval ):
    '''
    Human ENCODE peak format. There are both broad and narrow peak formats.
    Formats are very similar; narrow peak has an additional column, though.

    Broad peak ( http://genome.ucsc.edu/FAQ/FAQformat#format13 ):
    This format is used to provide called regions of signal enrichment based
    on pooled, normalized (interpreted) data. It is a BED 6+3 format.

    Narrow peak http://genome.ucsc.edu/FAQ/FAQformat#format12 and :
    This format is used to provide called peaks of signal enrichment based on
    pooled, normalized (interpreted) data. It is a BED6+4 format.
    '''
    file_ext = "encodepeak"
    column_names = [ 'Chrom', 'Start', 'End', 'Name', 'Score', 'Strand', 'SignalValue', 'pValue', 'qValue', 'Peak' ]
    data_sources = { "data": "tabix", "index": "bigwig" }
    """Add metadata elements"""
    # Column positions follow the BED-like layout declared in column_names above.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    def sniff( self, filename ):
        # ENCODE peak data cannot be reliably distinguished from other
        # interval formats, so the type must be assigned explicitly.
        return False
class ChromatinInteractions( Interval ):
    '''
    Chromatin interactions obtained from 3C/5C/Hi-C experiments.
    '''
    file_ext = "chrint"
    track_type = "DiagonalHeatmapTrack"
    data_sources = { "data": "tabix", "index": "bigwig" }
    column_names = [ 'Chrom1', 'Start1', 'End1', 'Chrom2', 'Start2', 'End2', 'Value' ]
    """Add metadata elements"""
    # Two anchor regions (chrom/start/end pairs) plus an interaction value column.
    MetadataElement( name="chrom1Col", default=1, desc="Chrom1 column", param=metadata.ColumnParameter )
    MetadataElement( name="start1Col", default=2, desc="Start1 column", param=metadata.ColumnParameter )
    MetadataElement( name="end1Col", default=3, desc="End1 column", param=metadata.ColumnParameter )
    MetadataElement( name="chrom2Col", default=4, desc="Chrom2 column", param=metadata.ColumnParameter )
    MetadataElement( name="start2Col", default=5, desc="Start2 column", param=metadata.ColumnParameter )
    MetadataElement( name="end2Col", default=6, desc="End2 column", param=metadata.ColumnParameter )
    MetadataElement( name="valueCol", default=7, desc="Value column", param=metadata.ColumnParameter )
    MetadataElement( name="columns", default=7, desc="Number of columns", readonly=True, visible=False )
    def sniff( self, filename ):
        # Chromatin-interaction data cannot be reliably distinguished from
        # other tabular formats, so the type must be assigned explicitly.
        return False
  1280. if __name__ == '__main__':
  1281. import doctest, sys
  1282. doctest.testmod(sys.modules[__name__])