/lib/galaxy/datatypes/util/gff_util.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 431 lines · 373 code · 14 blank · 44 comment · 20 complexity · 5db39e67f201339020b7afc4e14b789a MD5 · raw file

  1. """
  2. Provides utilities for working with GFF files.
  3. """
  4. import copy
  5. import pkg_resources; pkg_resources.require( "bx-python" )
  6. from bx.intervals.io import *
  7. from bx.tabular.io import Header, Comment
  8. from galaxy.util.odict import odict
  9. class GFFInterval( GenomicInterval ):
  10. """
  11. A GFF interval, including attributes. If file is strictly a GFF file,
  12. only attribute is 'group.'
  13. """
  14. def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
  15. strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
  16. # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
  17. # temporarily set strand and then unset after initing GenomicInterval.
  18. unknown_strand = False
  19. if not fix_strand and fields[ strand_col ] == '.':
  20. unknown_strand = True
  21. fields[ strand_col ] = '+'
  22. GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
  23. default_strand, fix_strand=fix_strand )
  24. if unknown_strand:
  25. self.strand = '.'
  26. self.fields[ strand_col ] = '.'
  27. # Handle feature, score column.
  28. self.feature_col = feature_col
  29. if self.feature_col >= self.nfields:
  30. raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
  31. self.feature = self.fields[ self.feature_col ]
  32. self.score_col = score_col
  33. if self.score_col >= self.nfields:
  34. raise MissingFieldError( "No field for score_col (%d)" % score_col )
  35. self.score = self.fields[ self.score_col ]
  36. # GFF attributes.
  37. self.attributes = parse_gff_attributes( fields[8] )
  38. def copy( self ):
  39. return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
  40. self.end_col, self.strand_col, self.score_col, self.strand)
  41. class GFFFeature( GFFInterval ):
  42. """
  43. A GFF feature, which can include multiple intervals.
  44. """
  45. def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
  46. strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
  47. raw_size=0 ):
  48. # Use copy so that first interval and feature do not share fields.
  49. GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
  50. start_col, end_col, strand_col, score_col, default_strand, \
  51. fix_strand=fix_strand )
  52. self.intervals = intervals
  53. self.raw_size = raw_size
  54. # Use intervals to set feature attributes.
  55. for interval in self.intervals:
  56. # Error checking. NOTE: intervals need not share the same strand.
  57. if interval.chrom != self.chrom:
  58. raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
  59. ( interval.chrom, self.chrom ) )
  60. # Set start, end of interval.
  61. if interval.start < self.start:
  62. self.start = interval.start
  63. if interval.end > self.end:
  64. self.end = interval.end
  65. def name( self ):
  66. """ Returns feature's name. """
  67. name = None
  68. # Preference for name: GTF, GFF3, GFF.
  69. for attr_name in [
  70. # GTF:
  71. 'gene_id', 'transcript_id',
  72. # GFF3:
  73. 'ID', 'id',
  74. # GFF (TODO):
  75. 'group' ]:
  76. name = self.attributes.get( attr_name, None )
  77. if name is not None:
  78. break
  79. return name
  80. def copy( self ):
  81. intervals_copy = []
  82. for interval in self.intervals:
  83. intervals_copy.append( interval.copy() )
  84. return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
  85. self.score_col, self.strand, intervals=intervals_copy )
  86. def lines( self ):
  87. lines = []
  88. for interval in self.intervals:
  89. lines.append( '\t'.join( interval.fields ) )
  90. return lines
  91. class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
  92. """
  93. Reader wrapper that reads GFF intervals/lines and automatically converts
  94. them to BED format.
  95. """
  96. def parse_row( self, line ):
  97. # HACK: this should return a GFF interval, but bx-python operations
  98. # require GenomicInterval objects and subclasses will not work.
  99. interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
  100. self.end_col, self.strand_col, self.default_strand, \
  101. fix_strand=self.fix_strand )
  102. interval = convert_gff_coords_to_bed( interval )
  103. return interval
  104. class GFFReaderWrapper( NiceReaderWrapper ):
  105. """
  106. Reader wrapper for GFF files.
  107. Wrapper has two major functions:
  108. 1. group entries for GFF file (via group column), GFF3 (via id attribute),
  109. or GTF (via gene_id/transcript id);
  110. 2. convert coordinates from GFF format--starting and ending coordinates
  111. are 1-based, closed--to the 'traditional'/BED interval format--0 based,
  112. half-open. This is useful when using GFF files as inputs to tools that
  113. expect traditional interval format.
  114. """
  115. def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
  116. end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
  117. NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
  118. strand_col=strand_col, fix_strand=fix_strand, **kwargs )
  119. self.feature_col = feature_col
  120. self.score_col = score_col
  121. self.convert_to_bed_coord = convert_to_bed_coord
  122. self.last_line = None
  123. self.cur_offset = 0
  124. self.seed_interval = None
  125. self.seed_interval_line_len = 0
  126. def parse_row( self, line ):
  127. interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
  128. self.start_col, self.end_col, self.strand_col, self.score_col, \
  129. self.default_strand, fix_strand=self.fix_strand )
  130. return interval
  131. def next( self ):
  132. """ Returns next GFFFeature. """
  133. #
  134. # Helper function.
  135. #
  136. def handle_parse_error( parse_error ):
  137. """ Actions to take when ParseError found. """
  138. if self.outstream:
  139. if self.print_delegate and hasattr(self.print_delegate,"__call__"):
  140. self.print_delegate( self.outstream, e, self )
  141. self.skipped += 1
  142. # no reason to stuff an entire bad file into memmory
  143. if self.skipped < 10:
  144. self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
  145. # For debugging, uncomment this to propogate parsing exceptions up.
  146. # I.e. the underlying reason for an unexpected StopIteration exception
  147. # can be found by uncommenting this.
  148. # raise e
  149. #
  150. # Get next GFFFeature
  151. #
  152. raw_size = self.seed_interval_line_len
  153. # If there is no seed interval, set one. Also, if there are no more
  154. # intervals to read, this is where iterator dies.
  155. if not self.seed_interval:
  156. while not self.seed_interval:
  157. try:
  158. self.seed_interval = GenomicIntervalReader.next( self )
  159. except ParseError, e:
  160. handle_parse_error( e )
  161. # TODO: When no longer supporting python 2.4 use finally:
  162. #finally:
  163. raw_size += len( self.current_line )
  164. # If header or comment, clear seed interval and return it with its size.
  165. if isinstance( self.seed_interval, ( Header, Comment ) ):
  166. return_val = self.seed_interval
  167. return_val.raw_size = len( self.current_line )
  168. self.seed_interval = None
  169. self.seed_interval_line_len = 0
  170. return return_val
  171. # Initialize feature identifier from seed.
  172. feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
  173. # For GFF3
  174. feature_id = self.seed_interval.attributes.get( 'ID', None )
  175. feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
  176. # For GTF.
  177. feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
  178. feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
  179. # Read all intervals associated with seed.
  180. feature_intervals = []
  181. feature_intervals.append( self.seed_interval )
  182. while True:
  183. try:
  184. interval = GenomicIntervalReader.next( self )
  185. raw_size += len( self.current_line )
  186. except StopIteration, e:
  187. # No more intervals to read, but last feature needs to be
  188. # returned.
  189. interval = None
  190. raw_size += len( self.current_line )
  191. break
  192. except ParseError, e:
  193. handle_parse_error( e )
  194. raw_size += len( self.current_line )
  195. continue
  196. # TODO: When no longer supporting python 2.4 use finally:
  197. #finally:
  198. #raw_size += len( self.current_line )
  199. # Ignore comments.
  200. if isinstance( interval, Comment ):
  201. continue
  202. # Determine if interval is part of feature.
  203. part_of = False
  204. group = interval.attributes.get( 'group', None )
  205. # GFF test:
  206. if group and feature_group == group:
  207. part_of = True
  208. # GFF3 test:
  209. parent_id = interval.attributes.get( 'Parent', None )
  210. cur_id = interval.attributes.get( 'ID', None )
  211. if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
  212. part_of = True
  213. # GTF test:
  214. transcript_id = interval.attributes.get( 'transcript_id', None )
  215. if transcript_id and transcript_id == feature_transcript_id:
  216. part_of = True
  217. # If interval is not part of feature, clean up and break.
  218. if not part_of:
  219. # Adjust raw size because current line is not part of feature.
  220. raw_size -= len( self.current_line )
  221. break
  222. # Interval associated with feature.
  223. feature_intervals.append( interval )
  224. # Last interval read is the seed for the next interval.
  225. self.seed_interval = interval
  226. self.seed_interval_line_len = len( self.current_line )
  227. # Return feature.
  228. feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
  229. self.end_col, self.strand_col, self.score_col, \
  230. self.default_strand, fix_strand=self.fix_strand, \
  231. intervals=feature_intervals, raw_size=raw_size )
  232. # Convert to BED coords?
  233. if self.convert_to_bed_coord:
  234. convert_gff_coords_to_bed( feature )
  235. return feature
  236. def convert_bed_coords_to_gff( interval ):
  237. """
  238. Converts an interval object's coordinates from BED format to GFF format.
  239. Accepted object types include GenomicInterval and list (where the first
  240. element in the list is the interval's start, and the second element is
  241. the interval's end).
  242. """
  243. if isinstance( interval, GenomicInterval ):
  244. interval.start += 1
  245. if isinstance( interval, GFFFeature ):
  246. for subinterval in interval.intervals:
  247. convert_bed_coords_to_gff( subinterval )
  248. elif type ( interval ) is list:
  249. interval[ 0 ] += 1
  250. return interval
  251. def convert_gff_coords_to_bed( interval ):
  252. """
  253. Converts an interval object's coordinates from GFF format to BED format.
  254. Accepted object types include GFFFeature, GenomicInterval, and list (where
  255. the first element in the list is the interval's start, and the second
  256. element is the interval's end).
  257. """
  258. if isinstance( interval, GenomicInterval ):
  259. interval.start -= 1
  260. if isinstance( interval, GFFFeature ):
  261. for subinterval in interval.intervals:
  262. convert_gff_coords_to_bed( subinterval )
  263. elif type ( interval ) is list:
  264. interval[ 0 ] -= 1
  265. return interval
  266. def parse_gff_attributes( attr_str ):
  267. """
  268. Parses a GFF/GTF attribute string and returns a dictionary of name-value
  269. pairs. The general format for a GFF3 attributes string is
  270. name1=value1;name2=value2
  271. The general format for a GTF attribute string is
  272. name1 "value1" ; name2 "value2"
  273. The general format for a GFF attribute string is a single string that
  274. denotes the interval's group; in this case, method returns a dictionary
  275. with a single key-value pair, and key name is 'group'
  276. """
  277. attributes_list = attr_str.split(";")
  278. attributes = {}
  279. for name_value_pair in attributes_list:
  280. # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
  281. # attribute; next, try double quotes for GTF.
  282. pair = name_value_pair.strip().split("=")
  283. if len( pair ) == 1:
  284. pair = name_value_pair.strip().split("\"")
  285. if len( pair ) == 1:
  286. # Could not split for some reason -- raise exception?
  287. continue
  288. if pair == '':
  289. continue
  290. name = pair[0].strip()
  291. if name == '':
  292. continue
  293. # Need to strip double quote from values
  294. value = pair[1].strip(" \"")
  295. attributes[ name ] = value
  296. if len( attributes ) == 0:
  297. # Could not split attributes string, so entire string must be
  298. # 'group' attribute. This is the case for strictly GFF files.
  299. attributes['group'] = attr_str
  300. return attributes
  301. def gff_attributes_to_str( attrs, gff_format ):
  302. """
  303. Convert GFF attributes to string. Supported formats are GFF3, GTF.
  304. """
  305. if gff_format == 'GTF':
  306. format_string = '%s "%s"'
  307. # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
  308. id_attr = None
  309. if 'group' in attrs:
  310. id_attr = 'group'
  311. elif 'ID' in attrs:
  312. id_attr = 'ID'
  313. elif 'Parent' in attrs:
  314. id_attr = 'Parent'
  315. if id_attr:
  316. attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
  317. elif gff_format == 'GFF3':
  318. format_string = '%s=%s'
  319. attrs_strs = []
  320. for name, value in attrs.items():
  321. attrs_strs.append( format_string % ( name, value ) )
  322. return " ; ".join( attrs_strs )
  323. def read_unordered_gtf( iterator, strict=False ):
  324. """
  325. Returns GTF features found in an iterator. GTF lines need not be ordered
  326. or clustered for reader to work. Reader returns GFFFeature objects sorted
  327. by transcript_id, chrom, and start position.
  328. """
  329. # -- Get function that generates line/feature key. --
  330. get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
  331. if strict:
  332. # Strict GTF parsing uses transcript_id only to group lines into feature.
  333. key_fn = get_transcript_id
  334. else:
  335. # Use lenient parsing where chromosome + transcript_id is the key. This allows
  336. # transcripts with same ID on different chromosomes; this occurs in some popular
  337. # datasources, such as RefGenes in UCSC.
  338. key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
  339. # Aggregate intervals by transcript_id and collect comments.
  340. feature_intervals = odict()
  341. comments = []
  342. for count, line in enumerate( iterator ):
  343. if line.startswith( '#' ):
  344. comments.append( Comment( line ) )
  345. continue
  346. line_key = key_fn( line.split('\t') )
  347. if line_key in feature_intervals:
  348. feature = feature_intervals[ line_key ]
  349. else:
  350. feature = []
  351. feature_intervals[ line_key ] = feature
  352. feature.append( GFFInterval( None, line.split( '\t' ) ) )
  353. # Create features.
  354. chroms_features = {}
  355. for count, intervals in enumerate( feature_intervals.values() ):
  356. # Sort intervals by start position.
  357. intervals.sort( lambda a,b: cmp( a.start, b.start ) )
  358. feature = GFFFeature( None, intervals=intervals )
  359. if feature.chrom not in chroms_features:
  360. chroms_features[ feature.chrom ] = []
  361. chroms_features[ feature.chrom ].append( feature )
  362. # Sort features by chrom, start position.
  363. chroms_features_sorted = []
  364. for chrom_features in chroms_features.values():
  365. chroms_features_sorted.append( chrom_features )
  366. chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
  367. for features in chroms_features_sorted:
  368. features.sort( lambda a,b: cmp( a.start, b.start ) )
  369. # Yield comments first, then features.
  370. # FIXME: comments can appear anywhere in file, not just the beginning.
  371. # Ideally, then comments would be associated with features and output
  372. # just before feature/line.
  373. for comment in comments:
  374. yield comment
  375. for chrom_features in chroms_features_sorted:
  376. for feature in chrom_features:
  377. yield feature