/scripts/microbes/util.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 224 lines · 188 code · 14 blank · 22 comment · 63 complexity · 675d1595ca855dd936b404f21dc6d141 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Dan Blankenberg
  3. import sys
  4. assert sys.version_info[:2] >= ( 2, 4 )
  5. #genbank_to_bed
  6. class Region:
  7. def __init__( self ):
  8. self.qualifiers = {}
  9. self.start = None
  10. self.end = None
  11. self.strand = '+'
  12. def set_coordinates_by_location( self, location ):
  13. location = location.strip().lower().replace( '..', ',' )
  14. if "complement(" in location: #if part of the sequence is on the negative strand, it all is?
  15. self.strand = '-' #default of + strand
  16. for remove_text in ["join(", "order(", "complement(", ")"]:
  17. location = location.replace( remove_text, "" )
  18. for number in location.split( ',' ):
  19. number = number.strip('\n\r\t <>,()')
  20. if number:
  21. if "^" in number:
  22. #a single point
  23. #check that this is correct for points, ie: 413/NC_005027.gbk: misc_feature 6636286^6636287 ===> 6636285,6636286
  24. end = int( number.split( '^' )[0] )
  25. start = end - 1
  26. else:
  27. end = int( number )
  28. start = end - 1 #match BED coordinates
  29. if self.start is None or start < self.start:
  30. self.start = start
  31. if self.end is None or end > self.end:
  32. self.end = end
  33. class GenBankFeatureParser:
  34. """Parses Features from Single Locus GenBank file"""
  35. def __init__( self, fh, features_list = [] ):
  36. self.fh = fh
  37. self.features = {}
  38. fh.seek(0)
  39. in_features = False
  40. last_feature_name = None
  41. base_indent = 0
  42. last_indent = 0
  43. last_attr_name = None
  44. for line in fh:
  45. if not in_features and line.startswith('FEATURES'):
  46. in_features = True
  47. continue
  48. if in_features:
  49. lstrip = line.lstrip()
  50. if line and lstrip == line:
  51. break #end of feature block
  52. cur_indent = len( line ) - len( lstrip )
  53. if last_feature_name is None:
  54. base_indent = cur_indent
  55. if cur_indent == base_indent:
  56. #a new feature
  57. last_attr_name = None
  58. fields = lstrip.split( None, 1 )
  59. last_feature_name = fields[0].strip()
  60. if not features_list or ( features_list and last_feature_name in features_list ):
  61. if last_feature_name not in self.features:
  62. self.features[last_feature_name] = []
  63. region = Region()
  64. region.set_coordinates_by_location( fields[1] )
  65. self.features[last_feature_name].append( region )
  66. else:
  67. #add info to last known feature
  68. line = line.strip()
  69. if line.startswith( '/' ):
  70. fields = line[1:].split( '=', 1 )
  71. if len( fields ) == 2:
  72. last_attr_name, content = fields
  73. else:
  74. #No data
  75. last_attr_name = line[1:]
  76. content = ""
  77. content = content.strip( '"' )
  78. if last_attr_name not in self.features[last_feature_name][-1].qualifiers:
  79. self.features[last_feature_name][-1].qualifiers[last_attr_name] = []
  80. self.features[last_feature_name][-1].qualifiers[last_attr_name].append( content )
  81. elif last_attr_name is None and last_feature_name:
  82. # must still be working on location
  83. self.features[last_feature_name][-1].set_coordinates_by_location( line )
  84. else:
  85. #continuation of multi-line qualifier content
  86. if last_feature_name.lower() in ['translation']:
  87. self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s%s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
  88. else:
  89. self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s %s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
  90. def get_features_by_type( self, feature_type ):
  91. if feature_type not in self.features:
  92. return []
  93. else:
  94. return self.features[feature_type]
  95. # Parse A GenBank file and return arrays of BED regions for the corresponding features
  96. def get_bed_from_genbank(gb_file, chrom, feature_list):
  97. genbank_parser = GenBankFeatureParser( open( gb_file ) )
  98. features = {}
  99. for feature_type in feature_list:
  100. features[feature_type]=[]
  101. for feature in genbank_parser.get_features_by_type( feature_type ):
  102. name = ""
  103. for name_tag in ['gene', 'locus_tag', 'db_xref']:
  104. if name_tag in feature.qualifiers:
  105. if name: name = name + ";"
  106. name = name + feature.qualifiers[name_tag][0].replace(" ","_")
  107. if not name:
  108. name = "unknown"
  109. features[feature_type].append( "%s\t%s\t%s\t%s\t%s\t%s" % ( chrom, feature.start, feature.end, name, 0, feature.strand ) )#append new bed field here
  110. return features
  111. #geneMark to bed
  112. import sys
  113. #converts GeneMarkHMM to bed
  114. #returns an array of bed regions
  115. def get_bed_from_GeneMark(geneMark_filename, chr):
  116. orfs = open(geneMark_filename).readlines()
  117. while True:
  118. line = orfs.pop(0).strip()
  119. if line.startswith("--------"):
  120. orfs.pop(0)
  121. break
  122. orfs = "".join(orfs)
  123. ctr = 0
  124. regions = []
  125. for block in orfs.split("\n\n"):
  126. if block.startswith("List of Regions of interest"): break
  127. best_block = {'start':0,'end':0,'strand':'+','avg_prob':-sys.maxint,'start_prob':-sys.maxint,'name':'DNE'}
  128. ctr+=1
  129. ctr2=0
  130. for line in block.split("\n"):
  131. ctr2+=1
  132. fields = line.split()
  133. start = int(fields.pop(0))-1
  134. end = int(fields.pop(0))
  135. strand = fields.pop(0)
  136. if strand == 'complement':
  137. strand = "-"
  138. else:
  139. strand = "+"
  140. frame = fields.pop(0)
  141. frame = frame + " " + fields.pop(0)
  142. avg_prob = float(fields.pop(0))
  143. try:
  144. start_prob = float(fields.pop(0))
  145. except:
  146. start_prob = 0
  147. name = "orf_"+str(ctr)+"_"+str(ctr2)
  148. if avg_prob >= best_block['avg_prob']:
  149. if start_prob > best_block['start_prob']:
  150. best_block = {'start':start,'end':end,'strand':strand,'avg_prob':avg_prob,'start_prob':start_prob,'name':name}
  151. regions.append(chr+"\t"+str(best_block['start'])+"\t"+str(best_block['end'])+"\t"+best_block['name']+"\t"+str(int(best_block['avg_prob']*1000))+"\t"+best_block['strand'])
  152. return regions
  153. #geneMarkHMM to bed
  154. #converts GeneMarkHMM to bed
  155. #returns an array of bed regions
  156. def get_bed_from_GeneMarkHMM(geneMarkHMM_filename, chr):
  157. orfs = open(geneMarkHMM_filename).readlines()
  158. while True:
  159. line = orfs.pop(0).strip()
  160. if line == "Predicted genes":
  161. orfs.pop(0)
  162. orfs.pop(0)
  163. break
  164. regions = []
  165. for line in orfs:
  166. fields = line.split()
  167. name = "gene_number_"+fields.pop(0)
  168. strand = fields.pop(0)
  169. start = fields.pop(0)
  170. if start.startswith("<"): start = 1
  171. start = int(start)-1
  172. end = fields.pop(0)
  173. if end.startswith(">"): end = end[1:]
  174. end = int(end)
  175. score = 0 # no scores provided
  176. regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(score)+"\t"+strand)
  177. return regions
  178. #glimmer3 to bed
  179. #converts glimmer3 to bed, doing some linear scaling (probably not correct?) on scores
  180. #returns an array of bed regions
  181. def get_bed_from_glimmer3(glimmer3_filename, chr):
  182. max_score = -sys.maxint
  183. min_score = sys.maxint
  184. orfs = []
  185. for line in open(glimmer3_filename).readlines():
  186. if line.startswith(">"): continue
  187. fields = line.split()
  188. name = fields.pop(0)
  189. start = int(fields.pop(0))
  190. end = int(fields.pop(0))
  191. if int(fields.pop(0))<0:
  192. strand = "-"
  193. temp = start
  194. start = end
  195. end = temp
  196. else:
  197. strand = "+"
  198. start = start - 1
  199. score = (float(fields.pop(0)))
  200. if score > max_score: max_score = score
  201. if score < min_score: min_score = score
  202. orfs.append((chr,start,end,name,score,strand))
  203. delta = 0
  204. if min_score < 0: delta = min_score * -1
  205. regions = []
  206. for (chr,start,end,name,score,strand) in orfs:
  207. #need to cast to str because was having the case where 1000.0 was rounded to 999 by int, some sort of precision bug?
  208. my_score = int(float(str( ( (score+delta) * (1000-0-(min_score+delta)) ) / ( (max_score + delta) + 0 ))))
  209. regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(my_score)+"\t"+strand)
  210. return regions