/Bio/SeqUtils/ProtParam.py

http://github.com/biopython/biopython · Python · 340 lines · 163 code · 36 blank · 141 comment · 17 complexity · 66689baa960636e4cc64341dc91e549d MD5 · raw file

  1. # Copyright 2003 Yair Benita. All rights reserved.
  2. # This file is part of the Biopython distribution and governed by your
  3. # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
  4. # Please see the LICENSE file that should have been included as part of this
  5. # package.
  6. """Simple protein analysis.
  7. Examples
  8. --------
  9. >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
  10. >>> X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGT"
  11. ... "RDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEEC"
  12. ... "LFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILF"
  13. ... "LPLPV")
  14. >>> print(X.count_amino_acids()['A'])
  15. 6
  16. >>> print(X.count_amino_acids()['E'])
  17. 12
  18. >>> print("%0.2f" % X.get_amino_acids_percent()['A'])
  19. 0.04
  20. >>> print("%0.2f" % X.get_amino_acids_percent()['L'])
  21. 0.12
  22. >>> print("%0.2f" % X.molecular_weight())
  23. 17103.16
  24. >>> print("%0.2f" % X.aromaticity())
  25. 0.10
  26. >>> print("%0.2f" % X.instability_index())
  27. 41.98
  28. >>> print("%0.2f" % X.isoelectric_point())
  29. 7.72
  30. >>> sec_struc = X.secondary_structure_fraction() # [helix, turn, sheet]
  31. >>> print("%0.2f" % sec_struc[0]) # helix
  32. 0.28
  33. >>> epsilon_prot = X.molar_extinction_coefficient() # [reduced, oxidized]
  34. >>> print(epsilon_prot[0]) # with reduced cysteines
  35. 17420
  36. >>> print(epsilon_prot[1]) # with disulfid bridges
  37. 17545
  38. Other public methods are:
  39. - gravy
  40. - protein_scale
  41. - flexibility
  42. - charge_at_pH
  43. """
  44. import sys
  45. from Bio.SeqUtils import ProtParamData # Local
  46. from Bio.SeqUtils import IsoelectricPoint # Local
  47. from Bio.Seq import Seq
  48. from Bio.Alphabet import IUPAC
  49. from Bio.Data import IUPACData
  50. from Bio.SeqUtils import molecular_weight
  51. class ProteinAnalysis:
  52. """Class containing methods for protein analysis.
  53. The constructor takes two arguments.
  54. The first is the protein sequence as a string, which is then converted to a
  55. sequence object using the Bio.Seq module. This is done just to make sure
  56. the sequence is a protein sequence and not anything else.
  57. The second argument is optional. If set to True, the weight of the amino
  58. acids will be calculated using their monoisotopic mass (the weight of the
  59. most abundant isotopes for each element), instead of the average molecular
  60. mass (the averaged weight of all stable isotopes for each element).
  61. If set to false (the default value) or left out, the IUPAC average
  62. molecular mass will be used for the calculation.
  63. """
  64. def __init__(self, prot_sequence, monoisotopic=False):
  65. """Initialize the class."""
  66. if prot_sequence.islower():
  67. self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
  68. else:
  69. self.sequence = Seq(prot_sequence, IUPAC.protein)
  70. self.amino_acids_content = None
  71. self.amino_acids_percent = None
  72. self.length = len(self.sequence)
  73. self.monoisotopic = monoisotopic
  74. def count_amino_acids(self):
  75. """Count standard amino acids, return a dict.
  76. Counts the number times each amino acid is in the protein
  77. sequence. Returns a dictionary {AminoAcid:Number}.
  78. The return value is cached in self.amino_acids_content.
  79. It is not recalculated upon subsequent calls.
  80. """
  81. if self.amino_acids_content is None:
  82. prot_dic = {k: 0 for k in IUPACData.protein_letters}
  83. for aa in prot_dic:
  84. prot_dic[aa] = self.sequence.count(aa)
  85. self.amino_acids_content = prot_dic
  86. return self.amino_acids_content
  87. def get_amino_acids_percent(self):
  88. """Calculate the amino acid content in percentages.
  89. The same as count_amino_acids only returns the Number in percentage of
  90. entire sequence. Returns a dictionary of {AminoAcid:percentage}.
  91. The return value is cached in self.amino_acids_percent.
  92. input is the dictionary self.amino_acids_content.
  93. output is a dictionary with amino acids as keys.
  94. """
  95. if self.amino_acids_percent is None:
  96. aa_counts = self.count_amino_acids()
  97. percentages = {}
  98. for aa in aa_counts:
  99. percentages[aa] = aa_counts[aa] / float(self.length)
  100. self.amino_acids_percent = percentages
  101. return self.amino_acids_percent
  102. def molecular_weight(self):
  103. """Calculate MW from Protein sequence."""
  104. return molecular_weight(self.sequence, monoisotopic=self.monoisotopic)
  105. def aromaticity(self):
  106. """Calculate the aromaticity according to Lobry, 1994.
  107. Calculates the aromaticity value of a protein according to Lobry, 1994.
  108. It is simply the relative frequency of Phe+Trp+Tyr.
  109. """
  110. aromatic_aas = "YWF"
  111. aa_percentages = self.get_amino_acids_percent()
  112. aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas)
  113. return aromaticity
  114. def instability_index(self):
  115. """Calculate the instability index according to Guruprasad et al 1990.
  116. Implementation of the method of Guruprasad et al. 1990 to test a
  117. protein for stability. Any value above 40 means the protein is unstable
  118. (has a short half life).
  119. See: Guruprasad K., Reddy B.V.B., Pandit M.W.
  120. Protein Engineering 4:155-161(1990).
  121. """
  122. index = ProtParamData.DIWV
  123. score = 0.0
  124. for i in range(self.length - 1):
  125. this, next = self.sequence[i : i + 2]
  126. dipeptide_value = index[this][next]
  127. score += dipeptide_value
  128. return (10.0 / self.length) * score
  129. def flexibility(self):
  130. """Calculate the flexibility according to Vihinen, 1994.
  131. No argument to change window size because parameters are specific for
  132. a window=9. The parameters used are optimized for determining the
  133. flexibility.
  134. """
  135. flexibilities = ProtParamData.Flex
  136. window_size = 9
  137. weights = [0.25, 0.4375, 0.625, 0.8125, 1]
  138. scores = []
  139. for i in range(self.length - window_size):
  140. subsequence = self.sequence[i : i + window_size]
  141. score = 0.0
  142. for j in range(window_size // 2):
  143. front = subsequence[j]
  144. back = subsequence[window_size - j - 1]
  145. score += (flexibilities[front] + flexibilities[back]) * weights[j]
  146. middle = subsequence[window_size // 2 + 1]
  147. score += flexibilities[middle]
  148. scores.append(score / 5.25)
  149. return scores
  150. def gravy(self):
  151. """Calculate the gravy according to Kyte and Doolittle."""
  152. total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)
  153. return total_gravy / self.length
  154. def _weight_list(self, window, edge):
  155. """Make list of relative weight of window edges (PRIVATE).
  156. The relative weight of window edges are compared to the window
  157. center. The weights are linear. It actually generates half a list.
  158. For a window of size 9 and edge 0.4 you get a list of
  159. [0.4, 0.55, 0.7, 0.85].
  160. """
  161. unit = 2 * (1.0 - edge) / (window - 1)
  162. weights = [0.0] * (window // 2)
  163. for i in range(window // 2):
  164. weights[i] = edge + unit * i
  165. return weights
  166. def protein_scale(self, param_dict, window, edge=1.0):
  167. """Compute a profile by any amino acid scale.
  168. An amino acid scale is defined by a numerical value assigned to each
  169. type of amino acid. The most frequently used scales are the
  170. hydrophobicity or hydrophilicity scales and the secondary structure
  171. conformational parameters scales, but many other scales exist which
  172. are based on different chemical and physical properties of the
  173. amino acids. You can set several parameters that control the
  174. computation of a scale profile, such as the window size and the window
  175. edge relative weight value.
  176. WindowSize: The window size is the length of the interval to use for
  177. the profile computation. For a window size n, we use the i-(n-1)/2
  178. neighboring residues on each side to compute the score for residue i.
  179. The score for residue i is the sum of the scaled values for these
  180. amino acids, optionally weighted according to their position in the
  181. window.
  182. Edge: The central amino acid of the window always has a weight of 1.
  183. By default, the amino acids at the remaining window positions have the
  184. same weight, but you can make the residue at the center of the window
  185. have a larger weight than the others by setting the edge value for the
  186. residues at the beginning and end of the interval to a value between
  187. 0 and 1. For instance, for Edge=0.4 and a window size of 5 the weights
  188. will be: 0.4, 0.7, 1.0, 0.7, 0.4.
  189. The method returns a list of values which can be plotted to view the
  190. change along a protein sequence. Many scales exist. Just add your
  191. favorites to the ProtParamData modules.
  192. Similar to expasy's ProtScale:
  193. http://www.expasy.org/cgi-bin/protscale.pl
  194. """
  195. # generate the weights
  196. # _weight_list returns only one tail. If the list should be
  197. # [0.4,0.7,1.0,0.7,0.4] what you actually get from _weights_list
  198. # is [0.4,0.7]. The correct calculation is done in the loop.
  199. weights = self._weight_list(window, edge)
  200. scores = []
  201. # the score in each Window is divided by the sum of weights
  202. # (* 2 + 1) since the weight list is one sided:
  203. sum_of_weights = sum(weights) * 2 + 1
  204. for i in range(self.length - window + 1):
  205. subsequence = self.sequence[i : i + window]
  206. score = 0.0
  207. for j in range(window // 2):
  208. # walk from the outside of the Window towards the middle.
  209. # Iddo: try/except clauses added to avoid raising an exception
  210. # on a non-standard amino acid
  211. try:
  212. front = param_dict[subsequence[j]]
  213. back = param_dict[subsequence[window - j - 1]]
  214. score += weights[j] * front + weights[j] * back
  215. except KeyError:
  216. sys.stderr.write(
  217. "warning: %s or %s is not a standard "
  218. "amino acid.\n" % (subsequence[j], subsequence[window - j - 1])
  219. )
  220. # Now add the middle value, which always has a weight of 1.
  221. middle = subsequence[window // 2]
  222. if middle in param_dict:
  223. score += param_dict[middle]
  224. else:
  225. sys.stderr.write(
  226. "warning: %s is not a standard amino acid.\n" % middle
  227. )
  228. scores.append(score / sum_of_weights)
  229. return scores
  230. def isoelectric_point(self):
  231. """Calculate the isoelectric point.
  232. Uses the module IsoelectricPoint to calculate the pI of a protein.
  233. """
  234. aa_content = self.count_amino_acids()
  235. ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
  236. return ie_point.pi()
  237. def charge_at_pH(self, pH):
  238. """Calculate the charge of a protein at given pH."""
  239. aa_content = self.count_amino_acids()
  240. charge = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
  241. return charge.charge_at_pH(pH)
  242. def secondary_structure_fraction(self):
  243. """Calculate fraction of helix, turn and sheet.
  244. Returns a list of the fraction of amino acids which tend
  245. to be in Helix, Turn or Sheet.
  246. Amino acids in helix: V, I, Y, F, W, L.
  247. Amino acids in Turn: N, P, G, S.
  248. Amino acids in sheet: E, M, A, L.
  249. Returns a tuple of three floats (Helix, Turn, Sheet).
  250. """
  251. aa_percentages = self.get_amino_acids_percent()
  252. helix = sum(aa_percentages[r] for r in "VIYFWL")
  253. turn = sum(aa_percentages[r] for r in "NPGS")
  254. sheet = sum(aa_percentages[r] for r in "EMAL")
  255. return helix, turn, sheet
  256. def molar_extinction_coefficient(self):
  257. """Calculate the molar extinction coefficient.
  258. Calculates the molar extinction coefficient assuming cysteines
  259. (reduced) and cystines residues (Cys-Cys-bond)
  260. """
  261. num_aa = self.count_amino_acids()
  262. mec_reduced = num_aa["W"] * 5500 + num_aa["Y"] * 1490
  263. mec_cystines = mec_reduced + (num_aa["C"] // 2) * 125
  264. return (mec_reduced, mec_cystines)
  265. if __name__ == "__main__":
  266. from Bio._utils import run_doctest
  267. run_doctest()