PageRenderTime 1537ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/itmi_vcfqc_optimized_with_globus_transfer/pymodules/python2.7/lib/python/biopython-1.65-py2.7-linux-x86_64.egg/Bio/PDB/DSSP.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 346 lines | 298 code | 13 blank | 35 comment | 3 complexity | 0d4f6da108cbfe2edc877afb9325eaf1 MD5 | raw file
  1. # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
  2. # This code is part of the Biopython distribution and governed by its
  3. # license. Please see the LICENSE file that should have been included
  4. # as part of this package.
  5. """Use the DSSP program to calculate secondary structure and accessibility.
  6. You need to have a working version of DSSP (and a license, free for academic
  7. use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}.
  8. The DSSP codes for secondary structure used here are:
  9. - H Alpha helix (4-12)
  10. - B Isolated beta-bridge residue
  11. - E Strand
  12. - G 3-10 helix
  13. - I pi helix
  14. - T Turn
  15. - S Bend
  16. - - None
  17. """
  18. from __future__ import print_function
  19. __docformat__ = "restructuredtext en"
  20. import re
  21. from Bio._py3k import StringIO
  22. import subprocess
  23. from Bio.Data import SCOPData
  24. from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap
  25. from Bio.PDB.PDBExceptions import PDBException
  26. from Bio.PDB.PDBParser import PDBParser
  27. # Match C in DSSP
  28. _dssp_cys=re.compile('[a-z]')
  29. # Maximal ASA of amino acids
  30. # Values from Sander & Rost, (1994), Proteins, 20:216-226
  31. # Used for relative accessibility
  32. MAX_ACC={}
  33. MAX_ACC["ALA"]=106.0
  34. MAX_ACC["CYS"]=135.0
  35. MAX_ACC["ASP"]=163.0
  36. MAX_ACC["GLU"]=194.0
  37. MAX_ACC["PHE"]=197.0
  38. MAX_ACC["GLY"]=84.0
  39. MAX_ACC["HIS"]=184.0
  40. MAX_ACC["ILE"]=169.0
  41. MAX_ACC["LYS"]=205.0
  42. MAX_ACC["LEU"]=164.0
  43. MAX_ACC["MET"]=188.0
  44. MAX_ACC["ASN"]=157.0
  45. MAX_ACC["PRO"]=136.0
  46. MAX_ACC["GLN"]=198.0
  47. MAX_ACC["ARG"]=248.0
  48. MAX_ACC["SER"]=130.0
  49. MAX_ACC["THR"]=142.0
  50. MAX_ACC["VAL"]=142.0
  51. MAX_ACC["TRP"]=227.0
  52. MAX_ACC["TYR"]=222.0
  53. def ss_to_index(ss):
  54. """
  55. Secondary structure symbol to index.
  56. H=0
  57. E=1
  58. C=2
  59. """
  60. if ss=='H':
  61. return 0
  62. if ss=='E':
  63. return 1
  64. if ss=='C':
  65. return 2
  66. assert 0
  67. def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
  68. """
  69. Create a DSSP dictionary from a PDB file.
  70. Example:
  71. --------
  72. >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb")
  73. >>> aa, ss, acc=dssp_dict[('A', 1)]
  74. ::
  75. @param in_file: pdb file
  76. @type in_file: string ::
  77. @param DSSP: DSSP executable (argument to os.system)
  78. @type DSSP: string ::
  79. @return: a dictionary that maps (chainid, resid) to
  80. amino acid type, secondary structure code and
  81. accessibility.
  82. @rtype: {}
  83. """
  84. # Using universal newlines is important on Python 3, this
  85. # gives unicode handles rather than bytes handles.
  86. p = subprocess.Popen([DSSP, in_file], universal_newlines=True,
  87. stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  88. out, err = p.communicate()
  89. out_dict, keys = _make_dssp_dict(StringIO(out))
  90. return out_dict, keys
  91. def make_dssp_dict(filename):
  92. """
  93. Return a DSSP dictionary that maps (chainid, resid) to
  94. aa, ss and accessibility, from a DSSP file. ::
  95. @param filename: the DSSP output file
  96. @type filename: string
  97. """
  98. with open(filename, "r") as handle:
  99. return _make_dssp_dict(handle)
  100. def _make_dssp_dict(handle):
  101. """
  102. Return a DSSP dictionary that maps (chainid, resid) to
  103. aa, ss and accessibility, from an open DSSP file object. ::
  104. @param handle: the open DSSP output file handle
  105. @type handle: file
  106. """
  107. dssp = {}
  108. start = 0
  109. keys = []
  110. for l in handle.readlines():
  111. sl = l.split()
  112. if len(sl) < 2:
  113. continue
  114. if sl[1] == "RESIDUE":
  115. # Start parsing from here
  116. start = 1
  117. continue
  118. if not start:
  119. continue
  120. if l[9] == " ":
  121. # Skip -- missing residue
  122. continue
  123. resseq = int(l[5:10])
  124. icode = l[10]
  125. chainid = l[11]
  126. aa = l[13]
  127. ss = l[16]
  128. if ss == " ":
  129. ss = "-"
  130. try:
  131. acc = int(l[34:38])
  132. phi = float(l[103:109])
  133. psi = float(l[109:115])
  134. except ValueError as exc:
  135. # DSSP output breaks its own format when there are >9999
  136. # residues, since only 4 digits are allocated to the seq num
  137. # field. See 3kic chain T res 321, 1vsy chain T res 6077.
  138. # Here, look for whitespace to figure out the number of extra
  139. # digits, and shift parsing the rest of the line by that amount.
  140. if l[34] != ' ':
  141. shift = l[34:].find(' ')
  142. acc = int((l[34+shift:38+shift]))
  143. phi = float(l[103+shift:109+shift])
  144. psi = float(l[109+shift:115+shift])
  145. else:
  146. raise ValueError(exc)
  147. res_id = (" ", resseq, icode)
  148. dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi)
  149. keys.append((chainid, res_id))
  150. return dssp, keys
  151. class DSSP(AbstractResiduePropertyMap):
  152. """
  153. Run DSSP on a pdb file, and provide a handle to the
  154. DSSP secondary structure and accessibility.
  155. **Note** that DSSP can only handle one model.
  156. Example:
  157. --------
  158. >>> p = PDBParser()
  159. >>> structure = p.get_structure("1MOT", "1MOT.pdb")
  160. >>> model = structure[0]
  161. >>> dssp = DSSP(model, "1MOT.pdb")
  162. >>> # DSSP data is accessed by a tuple (chain_id, res_id)
  163. >>> a_key = list(dssp)[2]
  164. >>> # residue object, secondary structure, solvent accessibility,
  165. >>> # relative accessiblity, phi, psi
  166. >>> dssp[a_key]
  167. (<Residue ALA het= resseq=251 icode= >,
  168. 'H',
  169. 72,
  170. 0.67924528301886788,
  171. -61.200000000000003,
  172. -42.399999999999999)
  173. """
  174. def __init__(self, model, pdb_file, dssp="dssp"):
  175. """
  176. ::
  177. @param model: the first model of the structure
  178. @type model: L{Model} ::
  179. @param pdb_file: a PDB file
  180. @type pdb_file: string ::
  181. @param dssp: the dssp executable (ie. the argument to os.system)
  182. @type dssp: string
  183. """
  184. # create DSSP dictionary
  185. dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp)
  186. dssp_map = {}
  187. dssp_list = []
  188. def resid2code(res_id):
  189. """Serialize a residue's resseq and icode for easy comparison."""
  190. return '%s%s' % (res_id[1], res_id[2])
  191. # Now create a dictionary that maps Residue objects to
  192. # secondary structure and accessibility, and a list of
  193. # (residue, (secondary structure, accessibility)) tuples
  194. for key in dssp_keys:
  195. chain_id, res_id = key
  196. chain = model[chain_id]
  197. try:
  198. res = chain[res_id]
  199. except KeyError:
  200. # In DSSP, HET field is not considered in residue identifier.
  201. # Thus HETATM records may cause unnecessary exceptions.
  202. # (See 3jui chain A res 593.)
  203. # Try the lookup again with all HETATM other than water
  204. res_seq_icode = resid2code(res_id)
  205. for r in chain:
  206. if r.id[0] not in (' ', 'W'):
  207. # Compare resseq + icode
  208. if resid2code(r.id) == res_seq_icode:
  209. # Found a matching residue
  210. res = r
  211. break
  212. else:
  213. raise KeyError(res_id)
  214. # For disordered residues of point mutations, BioPython uses the
  215. # last one as default, But DSSP takes the first one (alternative
  216. # location is blank, A or 1). See 1h9h chain E resi 22.
  217. # Here we select the res in which all atoms have altloc blank, A or
  218. # 1. If no such residues are found, simply use the first one appears
  219. # (as DSSP does).
  220. if res.is_disordered() == 2:
  221. for rk in res.disordered_get_id_list():
  222. # All atoms in the disordered residue should have the same
  223. # altloc, so it suffices to check the altloc of the first
  224. # atom.
  225. altloc = res.child_dict[rk].get_list()[0].get_altloc()
  226. if altloc in tuple('A1 '):
  227. res.disordered_select(rk)
  228. break
  229. else:
  230. # Simply select the first one
  231. res.disordered_select(res.disordered_get_id_list()[0])
  232. # Sometimes point mutations are put into HETATM and ATOM with altloc
  233. # 'A' and 'B'.
  234. # See 3piu chain A residue 273:
  235. # <Residue LLP het=H_LLP resseq=273 icode= >
  236. # <Residue LYS het= resseq=273 icode= >
  237. # DSSP uses the HETATM LLP as it has altloc 'A'
  238. # We check the altloc code here.
  239. elif res.is_disordered() == 1:
  240. # Check altloc of all atoms in the DisorderedResidue. If it
  241. # contains blank, A or 1, then use it. Otherwise, look for HET
  242. # residues of the same seq+icode. If not such HET residues are
  243. # found, just accept the current one.
  244. altlocs = set(a.get_altloc() for a in res.get_unpacked_list())
  245. if altlocs.isdisjoint('A1 '):
  246. # Try again with all HETATM other than water
  247. res_seq_icode = resid2code(res_id)
  248. for r in chain:
  249. if r.id[0] not in (' ', 'W'):
  250. if resid2code(r.id) == res_seq_icode and \
  251. r.get_list()[0].get_altloc() in tuple('A1 '):
  252. res = r
  253. break
  254. aa, ss, acc, phi, psi = dssp_dict[key]
  255. res.xtra["SS_DSSP"] = ss
  256. res.xtra["EXP_DSSP_ASA"] = acc
  257. res.xtra["PHI_DSSP"] = phi
  258. res.xtra["PSI_DSSP"] = psi
  259. # Relative accessibility
  260. resname = res.get_resname()
  261. try:
  262. rel_acc = acc/MAX_ACC[resname]
  263. except KeyError:
  264. # Invalid value for resname
  265. rel_acc = 'NA'
  266. else:
  267. if rel_acc > 1.0:
  268. rel_acc = 1.0
  269. res.xtra["EXP_DSSP_RASA"] = rel_acc
  270. # Verify if AA in DSSP == AA in Structure
  271. # Something went wrong if this is not true!
  272. # NB: DSSP uses X often
  273. resname = SCOPData.protein_letters_3to1.get(resname, 'X')
  274. if resname == "C":
  275. # DSSP renames C in C-bridges to a,b,c,d,...
  276. # - we rename it back to 'C'
  277. if _dssp_cys.match(aa):
  278. aa = 'C'
  279. # Take care of HETATM again
  280. if (resname != aa) and (res.id[0] == ' ' or aa != 'X'):
  281. raise PDBException("Structure/DSSP mismatch at %s" % res)
  282. dssp_map[key] = ((res, ss, acc, rel_acc, phi, psi))
  283. dssp_list.append((res, ss, acc, rel_acc, phi, psi))
  284. AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys,
  285. dssp_list)
  286. if __name__ == "__main__":
  287. import sys
  288. p = PDBParser()
  289. s = p.get_structure('X', sys.argv[1])
  290. model = s[0]
  291. d = DSSP(model, sys.argv[1])
  292. for r in d:
  293. print(r)
  294. print("Handled %i residues" % len(d))
  295. print(sorted(d))
  296. if ('A', 1) in d:
  297. print(d[('A', 1)])
  298. print(s[0]['A'][1].xtra)
  299. # Secondary structure
  300. print(''.join(item[1] for item in d))