PageRenderTime 29ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 1ms

/Bio/FSSP/__init__.py

https://gitlab.com/18runt88/biopython
Python | 281 lines | 257 code | 8 blank | 16 comment | 0 complexity | a293bf844b63600f47074853252113d6 MD5 | raw file
  1. # This code is part of the Biopython distribution and governed by its
  2. # license. Please see the LICENSE file that should have been included
  3. # as part of this package.
  4. #
  5. """Parser for FSSP files, used in a database of protein fold classifications.
  6. This is a module to handle FSSP files. For now it parses only the header,
  7. summary and alignment sections.
  8. See: Holm and Sander (1996) The FSSP database: fold classification based on
  9. structure-structure alignment of proteins.
  10. functions: read_fssp(file_handle): reads an fssp file into the records. Returns a
  11. tuple of three instances: the header, the summary dictionary and the alignment dictionary.
  12. mult_align: returns a Biopython alignment object
  13. """
  14. from __future__ import print_function
  15. import re
  16. from . import fssp_rec
  17. from Bio.Align import Generic
  18. from Bio import Alphabet
  19. __docformat__ = "restructuredtext en"
  20. fff_rec = fssp_rec.fff_rec
  21. header_records = {
  22. 'database': re.compile('^DATABASE'),
  23. 'pdbid': re.compile('^PDBID'),
  24. 'header': re.compile('^HEADER'),
  25. 'compnd': re.compile('^COMPND'),
  26. 'author': re.compile('^AUTHOR'),
  27. 'source': re.compile('^SOURCE'),
  28. 'seqlength': re.compile('^SEQLENGTH'),
  29. 'nalign': re.compile('^NALIGN')
  30. }
  31. summary_title = re.compile('## +SUMMARY')
  32. summary_rec = re.compile(' *[0-9]+: +[1-9][0-9a-z]{3,3}')
  33. alignments_title = re.compile('## +ALIGNMENTS')
  34. alignments_rec = re.compile(' *[0-9]+ +-{0,1}[0-9]+')
  35. equiv_title = re.compile('## +EQUIVALENCES')
  36. class FSSPHeader(object):
  37. def __init__(self):
  38. self.database = None
  39. self.pdbid = ''
  40. self.header = ''
  41. self.compnd = ''
  42. self.source = ''
  43. self.author = []
  44. self.seqlength = 0
  45. self.nalign = 0
  46. def fill_header(self, inline):
  47. for i in header_records:
  48. if header_records[i].match(inline):
  49. if i == 'database' or i == 'seqlength' or i == 'nalign':
  50. setattr(self, i, int(inline.split()[1]))
  51. elif i == 'compnd' or i == 'author':
  52. setattr(self, i, inline.split()[1:])
  53. elif i == 'source' or i == 'header':
  54. attr = inline[inline.find(' ') + 1:].strip()
  55. setattr(self, i, attr)
  56. else:
  57. setattr(self, i, inline.split()[1])
  58. class PosAlign(object):
  59. def __init__(self, inStr):
  60. inStr = inStr.strip()
  61. if len(inStr) != 1 and len(inStr) != 2:
  62. raise ValueError('PosAlign: length not 2 chars' + inStr)
  63. if inStr == '..':
  64. self.aa = '-'
  65. self.gap = 1
  66. else:
  67. self.gap = 0
  68. self.aa = inStr[0]
  69. if self.aa == self.aa.lower():
  70. self.aa = 'C'
  71. if len(inStr) == 2:
  72. self.ss = inStr[1].upper()
  73. else:
  74. self.ss = '0'
  75. def __repr__(self):
  76. if self.gap:
  77. outstring = '..'
  78. else:
  79. outstring = self.aa + self.ss.lower()
  80. return outstring
  81. __str__ = __repr__
  82. class FSSPSumRec(object):
  83. """ Contains info from an FSSP summary record"""
  84. def __init__(self, in_str):
  85. self.raw = in_str
  86. in_rec = in_str.strip().split()
  87. # print(in_rec)
  88. self.nr = int(in_rec[0][:-1])
  89. self.pdb1 = in_rec[1][:4]
  90. if len(in_rec[1]) == 4:
  91. self.chain1 = '0'
  92. elif len(in_rec[1]) == 5:
  93. self.chain1 = in_rec[1][4]
  94. else:
  95. raise ValueError('Bad PDB ID 1')
  96. self.pdb2 = in_rec[2][:4]
  97. if len(in_rec[2]) == 4:
  98. self.chain2 = '0'
  99. elif len(in_rec[2]) == 5:
  100. self.chain2 = in_rec[2][4]
  101. else:
  102. raise ValueError('Bad PDB ID 2')
  103. self.zscore = float(in_rec[3])
  104. self.rmsd = float(in_rec[4])
  105. self.lali = float(in_rec[5])
  106. self.lseq2 = float(in_rec[6])
  107. self.pID = float(in_rec[7])
  108. self.revers = int(in_rec[8])
  109. self.permut = int(in_rec[9])
  110. self.nfrag = int(in_rec[10])
  111. self.topo = in_rec[11]
  112. self.doc = ''
  113. for i in in_rec[12:]:
  114. self.doc = self.doc + i + ' '
  115. self.doc = self.doc.rstrip() + '\n'
  116. def __repr__(self):
  117. return self.raw
  118. __str__ = __repr__
  119. class FSSPAlignRec(object):
  120. def __init__(self, in_fff_rec):
  121. # print(in_fff_rec)
  122. self.abs_res_num = int(in_fff_rec[fssp_rec.align.abs_res_num])
  123. self.pdb_res_num = in_fff_rec[fssp_rec.align.pdb_res_num].strip()
  124. self.chain_id = in_fff_rec[fssp_rec.align.chain_id]
  125. if self.chain_id == ' ':
  126. self.chain_id = '0'
  127. self.res_name = in_fff_rec[fssp_rec.align.res_name]
  128. if self.res_name == self.res_name.lower():
  129. self.res_name = 'C'
  130. self.ss1 = in_fff_rec[fssp_rec.align.ss1]
  131. self.turn3 = in_fff_rec[fssp_rec.align.turn3]
  132. self.turn4 = in_fff_rec[fssp_rec.align.turn4]
  133. self.turn5 = in_fff_rec[fssp_rec.align.turn5]
  134. self.pos_align_dict = {}
  135. self.PosAlignList = []
  136. def add_align_list(self, align_list):
  137. for i in align_list:
  138. self.PosAlignList.append(PosAlign(i))
  139. def pos_align_list2dict(self):
  140. j = 1
  141. for i in self.PosAlignList:
  142. self.pos_align_dict[j] = i
  143. j = j + 1
  144. class FSSPAlignDict(dict):
  145. def __init__(self):
  146. # The following two dictionaries are pointers to records in self
  147. # The first dictionary is a "pdb_residue_number: self_key"
  148. # The second dictionary is a "absolute_residue_number: self_key"
  149. self.pdb_res_dict = {}
  150. self.abs_res_dict = {}
  151. self.data = {}
  152. def build_resnum_list(self):
  153. for i in self:
  154. self.abs_res_dict[self[i].abs_res_num] = i
  155. self.pdb_res_dict[self[i].pdb_res_num] = i
  156. # Given an absolute residue number & chain, returns the relevant fssp
  157. # record
  158. def abs(self, num):
  159. return self[self.abs_res_dict[num]]
  160. # Given an PDB residue number & chain, returns the relevant fssp
  161. # record
  162. def pdb(self, num):
  163. return self[self.pdb_res_dict[num]]
  164. # Returns a sequence string
  165. def sequence(self, num):
  166. s = ''
  167. for i in sorted(self.abs_res_dict):
  168. s += self.abs(i).pos_align_dict[num].aa
  169. return s
  170. def fasta_mult_align(self):
  171. mult_align_dict = {}
  172. for j in self.abs(1).pos_align_dict:
  173. mult_align_dict[j] = ''
  174. for fssp_rec in self.values():
  175. for j in fssp_rec.pos_align_dict:
  176. mult_align_dict[j] += fssp_rec.pos_align_dict[j].aa
  177. out_str = ''
  178. for i in sorted(mult_align_dict):
  179. out_str += '> %d\n' % i
  180. k = 0
  181. for j in mult_align_dict[i]:
  182. k += 1
  183. if k % 72 == 0:
  184. out_str += '\n'
  185. out_str += j
  186. out_str += '\n'
  187. return out_str
  188. class FSSPSumDict(dict):
  189. pass
  190. #
  191. # Process a fssp file into its constituents. Return a 2-tuple containing
  192. # a list of FSSPSumRecs and a dictionary of alignment records.
  193. #
  194. def read_fssp(fssp_handle):
  195. header = FSSPHeader()
  196. sum_dict = FSSPSumDict()
  197. align_dict = FSSPAlignDict()
  198. curline = fssp_handle.readline()
  199. while not summary_title.match(curline):
  200. # Still in title
  201. header.fill_header(curline)
  202. curline = fssp_handle.readline()
  203. if not summary_title.match(curline):
  204. raise ValueError('Bad FSSP file: no summary record found')
  205. curline = fssp_handle.readline() # Read the title line, discard
  206. curline = fssp_handle.readline() # Read the next line
  207. # Process the summary records into a list
  208. while summary_rec.match(curline):
  209. cur_sum_rec = FSSPSumRec(curline)
  210. sum_dict[cur_sum_rec.nr] = cur_sum_rec
  211. curline = fssp_handle.readline()
  212. # Outer loop: process everything up to the EQUIVALENCES title record
  213. while not equiv_title.match(curline):
  214. while (not alignments_title.match(curline) and
  215. not equiv_title.match(curline)):
  216. curline = fssp_handle.readline()
  217. if not alignments_title.match(curline):
  218. if equiv_title.match(curline):
  219. # print("Reached equiv_title")
  220. break
  221. else:
  222. raise ValueError('Bad FSSP file: no alignments title record found')
  223. if equiv_title.match(curline):
  224. break
  225. # If we got to this point, this means that we have matched an
  226. # alignments title. Parse the alignment records in a loop.
  227. curline = fssp_handle.readline() # Read the title line, discard
  228. curline = fssp_handle.readline() # Read the next line
  229. while alignments_rec.match(curline):
  230. align_rec = FSSPAlignRec(fff_rec(curline))
  231. key = align_rec.chain_id + align_rec.res_name + str(align_rec.pdb_res_num)
  232. align_list = curline[fssp_rec.align.start_aa_list:].strip().split()
  233. if key not in align_dict:
  234. align_dict[key] = align_rec
  235. align_dict[key].add_align_list(align_list)
  236. curline = fssp_handle.readline()
  237. if not curline:
  238. print('EOFEOFEOF')
  239. raise EOFError
  240. for i in align_dict.values():
  241. i.pos_align_list2dict()
  242. del i.PosAlignList
  243. align_dict.build_resnum_list()
  244. return (header, sum_dict, align_dict)