PageRenderTime 58ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/proteogenomics/ProteinStatistics.py

https://bitbucket.org/andreyto/proteogenomics
Python | 267 lines | 236 code | 3 blank | 28 comment | 0 complexity | 00d79b3809bcdd68fd3b07c6621d1d2d MD5 | raw file
Possible License(s): AGPL-1.0
  1. ###############################################################################
  2. # #
  3. # Copyright (c) 2009 J. Craig Venter Institute. #
  4. # All rights reserved. #
  5. # #
  6. ###############################################################################
  7. # #
  8. # This program is free software: you can redistribute it and/or modify #
  9. # it under the terms of the GNU General Public License as published by #
  10. # the Free Software Foundation, either version 3 of the License, or #
  11. # (at your option) any later version. #
  12. # #
  13. # This program is distributed in the hope that it will be useful, #
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of #
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
  16. # GNU General Public License for more details. #
  17. # #
  18. # You should have received a copy of the GNU General Public License #
  19. # along with this program. If not, see <http://www.gnu.org/licenses/>. #
  20. # #
  21. ###############################################################################
  22. """These are for functions on protein sequences. Right now it's only hydropathy, but maybe more will come up
  23. """
  24. import BasicStats
  25. import Global
  26. import string
  27. import math
  28. class HydrophobicIndex:
  29. """scores Kyte-Doolittle, grouping a la Lehninger"""
  30. HI = {}
  31. #NonPolar
  32. HI["G"] = -0.4 #Glycine
  33. HI["A"] = 1.8 #Alanine
  34. HI["P"] = -1.6 #Proline #### Typo in Lenhinger!!!
  35. HI["V"] = 4.2 #Valine
  36. HI["L"] = 3.8 #Leucine
  37. HI["I"] = 4.5 #Isoleucine
  38. HI["M"] = 1.9 #Methionine
  39. #Aromatic
  40. HI["F"] = 2.8 #Phenylalanine
  41. HI["Y"] = -1.3 #Tyrosine
  42. HI["W"] = -0.9 #Tryptophan
  43. #polar, uncharged
  44. HI["S"] = -0.8 #Serine
  45. HI["T"] = -0.7 #Threonine
  46. HI["C"] = 2.5 #Cysteine
  47. HI["N"] = -3.5 #Asparagine
  48. HI["Q"] = -3.5 #Glutamine
  49. #polar, charged
  50. HI["K"] = -3.9 #Lysine
  51. HI["H"] = -3.2 #Histidine
  52. HI["R"] = -4.5 #Arginine
  53. HI["D"] = -3.5 #Aspartate
  54. HI["E"] = -3.5 #Glutamate
  55. class HyrdopathyPlot:
  56. def __init__(self):
  57. self.Metric = HydrophobicIndex
  58. def MakePlot(self, Sequence, BinLen = 5):
  59. """given a sequence, do the plot. you can supply a bin len, but it's your
  60. job to figure out what a good length is. I read somewhere 5-7.
  61. """
  62. EffectiveLength = len(Sequence) - BinLen + 1 #plus one to get the last bin
  63. Plot = []
  64. for Index in range(EffectiveLength):
  65. #now look at the kmer starting at Index
  66. Values = []
  67. for Jndex in range(Index, Index+BinLen):
  68. Letter = Sequence[Jndex]
  69. if not Letter in (string.uppercase):
  70. print "I'm barfing on %s"%Sequence
  71. continue
  72. Values.append( self.Metric.HI[Letter])
  73. Mean = BasicStats.GetMean(Values)
  74. Plot.append(Mean)
  75. return Plot
  76. def IsConsistentSignal(self, List, Min = 0.5, Len = 10):
  77. """I am trying to see if here is a consistent hydrophobic patch in this
  78. plot. So for standards, we count positive values, at least 10 in a row
  79. """
  80. Counter = 0
  81. Index = -1 #start at -1 so I can increment right off the bat
  82. StartIndex = -1 #start of the hydrophobic patch
  83. for Item in List:
  84. Index += 1 #index into the array
  85. if Item > Min:
  86. if not Counter:
  87. #this is the first one in a row
  88. StartIndex = Index
  89. Counter += 1
  90. if Counter >= Len:
  91. return StartIndex
  92. else:
  93. Counter =0
  94. StartIndex =-1
  95. return -1 # not found
  96. def GetMW(Aminos):
  97. Aminos = Aminos.upper()
  98. MW = 0
  99. for Amino in Aminos:
  100. MW += Global.AminoMass.get(Amino, 0)
  101. MW += 19
  102. return MW
  103. def HasSignalPeptidaseMotif(Aminos):
  104. """Looking for the end of the peptide (the prefix to the observed protein) to have
  105. AxB.. where A in [ILVAGS] and B in [AGS]
  106. """
  107. APosition = Aminos[-3]
  108. BPosition = Aminos[-1]
  109. AcceptableA = ["I","L","V","A","G","S"]
  110. AcceptableB = ["A","G","S"]
  111. if not APosition in AcceptableA:
  112. return 0
  113. if not BPosition in AcceptableB:
  114. return 0
  115. return 1
  116. def HasBasicResidue(Sequence, Start = 0, End = None):
  117. """
  118. return 0/1 if there is a basic residue in the sequence given,
  119. and the bracketed subsequence
  120. """
  121. SubSequence = Sequence
  122. if End:
  123. SubSequence = Sequence[Start:End]
  124. else:
  125. SubSequence = Sequence[Start:]
  126. Basic = ["R", "K"]
  127. for Letter in SubSequence:
  128. if Letter in Basic:
  129. return 1
  130. return 0
  131. class SequenceComplexity:
  132. """this class is meant to measure a growing number of sequence complexity traits
  133. in protein sequences.
  134. It is currently not functional, as the parameters and workflows need to change
  135. """
  136. def __init__(self):
  137. pass
  138. def CheckComplexity(self, genome):
  139. if self.Verbose:
  140. print "ProteogenomicsPostProcessing.py:CheckComplexity"
  141. LowMW = ["G", "A"]
  142. List = []
  143. for (chromName, chrom) in genome.chromosomes.items():
  144. for (orfName,ORF) in chrom.simpleOrfs.items() + chrom.pepOnlyOrfs.items():
  145. PeptideString = ""
  146. for PeptideObject in ORF.PeptideLocationList:
  147. PeptideString += PeptideObject.Aminos
  148. Count = 0
  149. for Letter in PeptideString:
  150. if Letter in LowMW:
  151. Count +=1
  152. Normalized = Count / float (len(PeptideString))
  153. List.append(Normalized)
  154. Histogram = BasicStats.CreateHistogram(List, 0, 0.05)
  155. BasicStats.PrettyPrintHistogram(Histogram, None)
  156. def SequenceComplexityHack(self, genome):
  157. """hack"""
  158. ORFList = []
  159. PeptideList = []
  160. DiffList = []
  161. RealProteinList = []
  162. for (chromName, chrom) in genome.chromosomes.items():
  163. for (orfName,ORF) in chrom.simpleOrfs.items() + chrom.pepOnlyOrfs.items():
  164. #we try for the predicted protein first
  165. Sequence = ORF.GetProteinSequence()
  166. if Sequence:
  167. Entropy = self.SequenceEntropy(Sequence)
  168. RealProteinList.append(Entropy)
  169. if not Sequence:
  170. Sequence = ORF.GetObservedSequence()
  171. ProteinEntropy = self.SequenceEntropy(Sequence)
  172. #now do for the peptides
  173. PeptideCat = ""
  174. for Peptide in ORF.peptideIter():
  175. PeptideCat += Peptide.GetAminos()
  176. PeptideEntropy = self.SequenceEntropy(PeptideCat)
  177. #put stuff in lists
  178. ORFList.append(ProteinEntropy)
  179. PeptideList.append(PeptideEntropy)
  180. Diff = ProteinEntropy - PeptideEntropy
  181. DiffList.append(Diff)
  182. ORFHandle = open("ORFEntropy.txt", "wb")
  183. ORFLine = "\t".join(map(str, ORFList))
  184. ORFHandle.write(ORFLine)
  185. ORFHandle.close()
  186. PeptideHandle = open("PeptideEntropy.txt", "wb")
  187. Line = "\t".join(map(str, PeptideList))
  188. PeptideHandle.write(Line)
  189. PeptideHandle.close()
  190. DiffHandle = open("DiffEntropy.txt", "wb")
  191. Line = "\t".join(map(str, DiffList))
  192. DiffHandle.write(Line)
  193. DiffHandle.close()
  194. RealHandle = open("RealProteinEntropy.txt", "wb")
  195. Line = "\t".join(map(str, RealProteinList))
  196. RealHandle.write(Line)
  197. RealHandle.close()
  198. def SequenceEntropy(self, Sequence):
  199. """Parameters: An amino acid sequence
  200. Return: the H(x) entropy
  201. Description: Use the classic information entropy equation to calculate
  202. the entropy of the input sequence.
  203. H(x) = SUM p(xi) * log(1/ p(xi))
  204. xi = letter of the sequence
  205. e.g. GGGAS
  206. x1 = G, p(G) = 3/5
  207. x2 = A, p(A) = 1/5
  208. X3 = S, p(S) = 1/5
  209. """
  210. ProbTable = self.GetProbabilityTable(Sequence)
  211. Sum =0
  212. for (Letter, Probability) in ProbTable.items():
  213. LogValue = math.log(1 / Probability) #currently the natural log. not sure the base of the log matters
  214. Sum += (Probability * LogValue)
  215. return Sum
  216. def GetProbabilityTable(self, Sequence):
  217. """Parameters: an amino acid sequence
  218. Return: a dictionary of probability (frequence/n) for each letter
  219. Description: just convert counts to probability. easy.
  220. """
  221. CountDict = {}
  222. ProbabilityDict = {}
  223. for Letter in Sequence:
  224. if not CountDict.has_key(Letter):
  225. CountDict[Letter] = 0 #initialize
  226. CountDict[Letter] += 1
  227. Len = float(len(Sequence)) #cast to float so we can do real division
  228. for (Key, Value) in CountDict.items():
  229. Probability = Value / Len
  230. ProbabilityDict[Key] = Probability
  231. return ProbabilityDict