PageRenderTime 50ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/pipviewer-0.3.9/pipviewer/pselfiles.py

#
Python | 351 lines | 230 code | 40 blank | 81 comment | 11 complexity | 05edeedb703fdca6d8d8cd15c0d67669 MD5 | raw file
Possible License(s): GPL-3.0
  1. #!/usr/bin/python
  2. # Copyright (C) 2006-2007 Free Software Foundation
  3. # This program is free software: you can redistribute it and/or modify
  4. # it under the terms of the GNU General Public License as published by
  5. # the Free Software Foundation, either version 3 of the License, or
  6. # (at your option) any later version.
  7. # This program is distributed in the hope that it will be useful,
  8. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. # GNU General Public License for more details.
  11. # You should have received a copy of the GNU General Public License
  12. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. """ Managment of the input files """
  14. #############################################################
  15. # I M P O R T I N G M O D U L E S
  16. #############################################################
  17. # our files are just pickled dicts
  18. from cPickle import load, dump
  19. import os
  20. from copy import copy
  21. from time import time
  22. import re
  23. import random
  24. #############################################################
  25. # G L O B A L C O N S T A N T S
  26. #############################################################
  27. # By design, to easy manipulation by external scripts, the input files
  28. # are just pickled dicts. We will do out best to keep the format as
  29. # stable as possible so you can free play with its content directly
  30. # without using an opaque interface. Internally, we reffer to the
  31. # content of loaded file as a "mapping".
  32. # The mapping features a multiple alignment and "annotations" on this
  33. # alignment that the user can add. Those annotations are (start, stop)
  34. # tuples describing regions. Region can be selected probes,
  35. # break-points, ... (more on this later)
  36. # The description in this file refers to a single version of the file
  37. # format. If you need to use a legacy file, look at the content of
  38. # this file in a passed release of this software.
  39. FILE_FORMAT_VERSION = 6
  40. VERSION_KEY = "format-version"
  41. # The official extention stands for Probe SELector
  42. FILE_FORMAT_EXT = ".psel"
  43. # The minimal length of the probes's names
  44. NAMES_MIN_LENGTH = 3
  45. #############################################################
  46. # K E Y S
  47. #############################################################
  48. # The region types, regions are (start, stop) tuples with the Python
  49. # slice semantic.
  50. PROBES = "probes" # selected probes
  51. BREAKS = "break-points" # low conservation (not really used)
  52. TRASHS = "trash-regions" # good conservation but poor score as a probe
  53. REGION_TYPES = [PROBES, BREAKS, TRASHS]
  54. # The mapping contains a multiple alignment, we need the alignment
  55. # sequence for each species and the species name. Those will point to
  56. # list of strings
  57. ALIGN = "aligned-sequences"
  58. SPECIES = "species-names"
  59. PROBES_IDS = "probes-ids"
  60. PROBES_IDS_LENGTH = "probes-ids-length"
  61. # To compute the probe scores we need the ref genomes, either refseq
  62. # number or the genome files. Those keys are optional and will point to
  63. # dict with integers as key. We use the index of the species in the
  64. # multiples alignment to map to its genome.
  65. REFSEQ_IDS = "ref-seq-ids"
  66. GENES_ANNOTATIONS = "genes-annotations"
  67. # TODO: those are not implemented yet
  68. #GENBANKS = "genbank-files"
  69. #FASTAS = "fasta-files"
  70. #############################################################
  71. # E X C E P T I O N S
  72. #############################################################
  73. class FileFormatError(Exception):
  74. pass
  75. class MappingFormatError(Exception):
  76. pass
  77. class InvalidFormatError(Exception):
  78. pass
  79. #############################################################
  80. # I / O
  81. #############################################################
  82. def load_mapping(file):
  83. """ Return the hashMap containing the alignment data. Take a filename
  84. or an opened file as parameter.
  85. """
  86. f = file
  87. try:
  88. f.read
  89. except AttributeError:
  90. f = open(file)
  91. try:
  92. mapping = load(f)
  93. except:
  94. raise FileFormatError("file isn't loadable by the pickel module")
  95. if mapping[VERSION_KEY] < FILE_FORMAT_VERSION:
  96. raise FileFormatError("file format version is deprecated")
  97. if mapping[VERSION_KEY] != FILE_FORMAT_VERSION:
  98. raise FileFormatError("file format version not supported")
  99. return mapping
  100. def save_mapping(mapping, file):
  101. """ Save the data contain in the mapping. Take a filename or an opened
  102. file as parameter (mapping is a dict).
  103. """
  104. f = file
  105. try:
  106. f.write
  107. except AttributeError:
  108. f = open(file, "w")
  109. # minimal validation
  110. for k in [ALIGN, SPECIES, REFSEQ_IDS, PROBES_IDS, PROBES, BREAKS, TRASHS, PROBES_IDS_LENGTH]:
  111. if not mapping.has_key(k):
  112. raise MappingFormatError("mapping lacks the " + k +" key")
  113. # we use proto 0 since Windows seems to have problem with the
  114. # latest proto.
  115. versionned_mapping = copy(mapping)
  116. versionned_mapping[VERSION_KEY] = FILE_FORMAT_VERSION
  117. # for various reasons its a good idea to sort the annotations
  118. for k in REGION_TYPES:
  119. versionned_mapping[k].sort()
  120. dump(versionned_mapping, f, 0)
  121. def set_empty_mapping(align, species):
  122. """ Return the default mapping containing the minimal data witch are the
  123. species names, the multiple alignment and the probes' name minimal length.
  124. Take the multiple alignment and the species list as parameters.
  125. """
  126. mapping = { VERSION_KEY:FILE_FORMAT_VERSION,
  127. ALIGN:align,
  128. SPECIES:species,
  129. REFSEQ_IDS:{},
  130. PROBES_IDS:{},
  131. PROBES:[],
  132. BREAKS:[],
  133. TRASHS:[],
  134. PROBES_IDS_LENGTH:NAMES_MIN_LENGTH,
  135. GENES_ANNOTATIONS:[] }
  136. return mapping
  137. def import_multipipmaker_file(file):
  138. """ Import a multipipmaker multiple alignement file. The content of
  139. the file is read and then converted into a valid mapping.
  140. """
  141. data = open(file).read()
  142. mapping = mapping_from_multipipmaker(data)
  143. return mapping
  144. def import_clustalw_file(file):
  145. """ Import a clustalw multiple alignement file. The content of
  146. the file is read and then converted into a valid mapping
  147. """
  148. data = open(file).read()
  149. mapping = mapping_from_clustalw(data)
  150. return mapping
  151. def mapping_from_multipipmaker(data):
  152. """ Convert a multipipmaker multiple alignement into a valid
  153. mapping. Data is the content of the ASCII produced by multipipmaker.
  154. At this stage we can't parse the pdf or postscript file.
  155. """
  156. # FIXME: will this work on mac ?
  157. lines = filter(lambda l:l, map(lambda l:l.strip(), data.split("\n")))
  158. nb_seq, length = map(int, lines[0].split(" "))
  159. # from nb_seq to end is the align
  160. align = lines[nb_seq+1:]
  161. species = lines[1:nb_seq+1]
  162. # Validation of the Pipmaker Format
  163. if nb_seq != len(align) or len(species) != len(align):
  164. raise InvalidFormatError("The file is not a valid Pipmaker format")
  165. for line in align:
  166. if len(line) != length:
  167. raise InvalidFormatError("The file is not a valid Pipmaker format")
  168. return set_empty_mapping(align, species)
  169. def mapping_from_clustalw(data):
  170. """ Convert a clustalW multiple alignement into a valid mapping.
  171. Data is the content of te ASCII produced by clustalW (.aln file).
  172. """
  173. lines = [l.replace("\r", "") for l in data.split("\n")]
  174. if not re.match(r"CLUSTAL (W|X) \(\d\.\d+\)", lines[0]):
  175. raise InvalidFormatError("The file is not a valid ClustalW format")
  176. # TODO: this parsing will not work on macs
  177. blocks = []
  178. cur_block = []
  179. for l in lines[3:]:
  180. if l == "":
  181. blocks.append(cur_block)
  182. cur_block = []
  183. else:
  184. cur_block.append(l)
  185. species = []
  186. align = []
  187. for line in blocks[0]:
  188. if line[0] != " ":
  189. species.append(line.split()[0])
  190. align.append(line.split()[1])
  191. nb_seq = len(species)
  192. for i in range(nb_seq):
  193. j = 1
  194. while j < len(blocks):
  195. align[i] = align[i] + blocks[j][i].split()[1]
  196. j = j+1
  197. seq_length = len(align[0])
  198. # Validation of the clustal format
  199. if nb_seq != len(species):
  200. raise InvalidFormatError("The file is not a valid ClustalW format")
  201. for seq in align:
  202. if len(seq) != seq_length:
  203. raise InvalidFormatError("The file is not a valid Clustal format")
  204. return set_empty_mapping(align, species)
  205. #############################################################
  206. # U T I L S
  207. #############################################################
  208. def conservation(mapping, pos):
  209. """ Return a value in [0..1] for the conservation. pos is a 0
  210. based position on the multiple alignment. No bound checking is
  211. done. Gaps are never considered conserved and we take the
  212. nucleotide that appears the most, not necessarily the one on the
  213. ref genome.
  214. """
  215. occ = {}
  216. vals = mapping[ALIGN]
  217. for s in vals:
  218. occ[s[pos]] = occ.get(s[pos], 0) + 1
  219. return 1.0*max([occ.get(n, 0) for n in ["A", "C", "G", "T"]])/len(vals)
  220. def ref_genome_seq(mapping, start, stop):
  221. """ Return the ref genome sequence between start and stop.
  222. Position are on the multiple alignment and gaps are trimed.
  223. """
  224. return mapping[ALIGN][0][start:stop].replace("-", "")
  225. def generate_probe_name(wordLength):
  226. """ Return a random name compose of a altenate vowel an consomne. Take
  227. the name length as parameter.
  228. """
  229. VOWEL = "aeiouy"
  230. CONSOMNE = "qwrtpsdfghjklzxcvbnm"
  231. random.seed(time())
  232. name = "P"
  233. for i in range(wordLength):
  234. if i%2 == 0:
  235. name = name + random.choice(VOWEL)
  236. else:
  237. name = name + random.choice(CONSOMNE)
  238. return name
  239. def next_probeName(mapping, start, stop):
  240. """ Generate name for a probe sequence and add it to the mapping. Return
  241. the modified mapping. Take the unmodified mapping and the starting and
  242. ending position of the probe sequence as parameters.
  243. """
  244. # Define the max limit before updating the name length
  245. MAX_LIMIT = 5
  246. wordLength = mapping[PROBES_IDS_LENGTH]
  247. limit = 0
  248. probeNames = []
  249. keys = []
  250. currentKey = (start, stop)
  251. # Generate a probe name
  252. newName = generate_probe_name(wordLength)
  253. for key in mapping[PROBES_IDS]:
  254. probeNames.append(mapping[PROBES_IDS][key])
  255. keys.append(key)
  256. # Return the unmodified mapping if the probe name is already define
  257. if currentKey in keys:
  258. return mapping
  259. # Add the probe name to the mapping if it don't exist
  260. else:
  261. while newName in probeNames and key != currentKey:
  262. newName = generate_probe_name(wordLength)
  263. limit = limit+1
  264. if limit == MAX_LIMIT:
  265. wordLength = wordLength+1
  266. mapping[PROBES_IDS_LENGTH] = wordLength
  267. mapping[PROBES_IDS][currentKey] = newName
  268. return mapping
  269. # simple self test
  270. if __name__ == "__main__":
  271. import sys
  272. load_mapping(sys.argv[1])