/Bio/UniGene/UniGene.py

https://github.com/timwintle/biopython · Python · 228 lines · 167 code · 40 blank · 21 comment · 45 complexity · a6ced4d74b0899a1af377dc93dfd9088 MD5 · raw file

  1. # Permission to use, copy, modify, and distribute this software and
  2. # its documentation with or without modifications and for any purpose
  3. # and without fee is hereby granted, provided that any copyright
  4. # notices appear in all copies and that both those copyright notices
  5. # and this permission notice appear in supporting documentation, and
  6. # that the names of the contributors or copyright holders not be used
  7. # in advertising or publicity pertaining to distribution of the software
  8. # without specific prior permission.
  9. #
  10. # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
  11. # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
  12. # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
  13. # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
  14. # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  15. # LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  16. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  17. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  18. import warnings
# Import-time side effect: merely importing this module emits a
# PendingDeprecationWarning that points users at the parser in Bio.UniGene.
warnings.warn("The module Bio.UniGene.UniGene is now obsolete, "
              "and will be deprecated and removed in a future "
              "release of Biopython. To parse UniGene flat files, "
              "please use the parser in Bio.UniGene instead",
              PendingDeprecationWarning)
  24. import string
  25. import operator
  26. import urllib
  27. import sgmllib
  28. import UserDict
  29. import Bio.File
  30. class UniGeneParser( sgmllib.SGMLParser ):
  31. def reset( self ):
  32. sgmllib.SGMLParser.reset( self )
  33. self.text = ''
  34. self.queue = UserDict.UserDict()
  35. self.open_tag_stack = []
  36. self.open_tag = 'open_html'
  37. self.key_waiting = ''
  38. self.master_key = ''
  39. self.context = 'general_info'
  40. def parse( self, handle ):
  41. self.reset()
  42. self.feed( handle )
  43. for key in self.queue:
  44. if( self.queue[ key ] == {} ):
  45. if( key[ :15 ] == 'UniGene Cluster' ):
  46. self.queue[ 'UniGene Cluster' ] = key[ 16: ]
  47. del self.queue[ key ]
  48. return self.queue
  49. #
  50. # Assumes an empty line between records
  51. #
  52. def feed( self, handle ):
  53. if isinstance(handle, Bio.File.UndoHandle):
  54. uhandle = handle
  55. else:
  56. uhandle = Bio.File.UndoHandle(handle)
  57. text = ''
  58. while 1:
  59. line = uhandle.readline()
  60. line = string.strip( line )
  61. if( line == '' ):
  62. break
  63. text = text + ' ' + line
  64. sgmllib.SGMLParser.feed( self, text )
  65. def handle_data(self, newtext ):
  66. newtext = string.strip( newtext )
  67. self.text = self.text + newtext
  68. def start_a( self, attrs ):
  69. if( self.context == 'seq_info' ):
  70. if( self.open_tag != 'open_b' ):
  71. self.text = ''
  72. # self.queue.append( attrs )
  73. def end_a( self ):
  74. if( self.context == 'seq_info' ):
  75. if( self.open_tag != 'open_b' ):
  76. if( self.key_waiting == '' ):
  77. self.key_waiting = self.text
  78. self.text = ''
  79. def start_b( self, attrs ):
  80. self.open_tag_stack.append( self.open_tag )
  81. self.open_tag = 'open_b'
  82. if( self.key_waiting == '' ):
  83. self.text = ''
  84. def end_b( self ):
  85. if( self.text[ :15 ] == 'UniGene Cluster' ):
  86. self.queue[ 'UniGene Cluster' ] = self.text[ 16: ]
  87. self.text = ''
  88. elif( self.key_waiting == '' ):
  89. self.extract_key()
  90. def extract_key( self ):
  91. text = string.strip( self.text )
  92. key = string.join( string.split( text ) )
  93. words = string.split( key )
  94. key = string.join( words[ :2 ] )
  95. self.text = ''
  96. try:
  97. self.open_tag = self.open_tag_stack.pop()
  98. except:
  99. self.open_tag = 'open_html'
  100. if( self.open_tag == 'open_table_data' ):
  101. if( self.context == 'general_info' ):
  102. if( self.key_waiting == '' ):
  103. self.key_waiting = key
  104. self.text = ''
  105. elif( self.context == 'seq_info' ):
  106. if( text == 'Key to Symbols' ):
  107. self.context = 'legend'
  108. self.master_key = key
  109. elif( self.context == 'general_info' ):
  110. self.master_key = key
  111. if( string.find( key, 'SEQUENCE' ) != -1 ):
  112. self.context = 'seq_info'
  113. self.queue[ key ] = UserDict.UserDict()
  114. elif( self.context == 'seq_info' ):
  115. self.queue[ key ] = UserDict.UserDict()
  116. self.master_key = key
  117. def start_table( self, attrs ):
  118. self.open_tag_stack.append( self.open_tag )
  119. self.open_tag = 'open_table'
  120. def end_table( self ):
  121. try:
  122. self.open_tag = self.open_tag_stack.pop()
  123. except:
  124. self.open_tag = 'open_html'
  125. self.key_waiting = ''
  126. def start_tr( self, attrs ):
  127. self.open_tag_stack.append( self.open_tag )
  128. self.open_tag = 'open_table_row'
  129. self.text = ''
  130. def end_tr( self ):
  131. try:
  132. self.open_tag = self.open_tag_stack.pop()
  133. except:
  134. self.open_tag = 'open_html'
  135. text = self.text
  136. if text:
  137. self.text = ''
  138. if( text[ 0 ] == ':' ):
  139. text = text[ 1: ]
  140. text = string.join( string.split( text ) )
  141. if( ( self.context == 'general_info' ) or \
  142. ( self.context == 'seq_info' ) ):
  143. try:
  144. contents = self.queue[ self.master_key ][ self.key_waiting ]
  145. if( type( contents ) == type( [] ) ):
  146. contents.append( text )
  147. else:
  148. self.queue[ self.master_key ][ self.key_waiting ] = \
  149. [ contents , text ]
  150. except:
  151. self.queue[ self.master_key ][ self.key_waiting ] = text
  152. self.key_waiting = ''
  153. def start_td( self, attrs ):
  154. self.open_tag_stack.append( self.open_tag )
  155. self.open_tag = 'open_table_data'
  156. def end_td( self ):
  157. try:
  158. self.open_tag = self.open_tag_stack.pop()
  159. except:
  160. self.open_tag = 'open_html'
  161. if( self.context == 'seq_info' ):
  162. self.text = self.text + ' '
  163. def print_item( self, item, level = 1 ):
  164. indent = ' '
  165. for j in range( 0, level ):
  166. indent = indent + ' '
  167. if( type( item ) == type( '' ) ):
  168. if( item != '' ):
  169. print '%s%s' % ( indent, item )
  170. elif( type( item ) == type([])):
  171. for subitem in item:
  172. self.print_item( subitem, level + 1 )
  173. elif( isinstance( item, UserDict.UserDict ) ):
  174. for subitem in item:
  175. print '%skey is %s' % ( indent, subitem )
  176. self.print_item( item[ subitem ], level + 1 )
  177. else:
  178. print item
  179. def print_tags( self ):
  180. for key in self.queue:
  181. print 'key %s' % key
  182. self.print_item( self.queue[ key ] )
  183. if( __name__ == '__main__' ):
  184. handle = open( 'Hs13225.htm')
  185. undo_handle = Bio.File.UndoHandle( handle )
  186. unigene_parser = UniGeneParser()
  187. unigene_parser.parse( handle )
  188. unigene_parser.print_tags()