/pipviewer-0.3.9/pipviewer/pselfiles.py
Python | 351 lines | 230 code | 40 blank | 81 comment | 11 complexity | 05edeedb703fdca6d8d8cd15c0d67669 MD5 | raw file
Possible License(s): GPL-3.0
1#!/usr/bin/python
2# Copyright (C) 2006-2007 Free Software Foundation
3
4# This program is free software: you can redistribute it and/or modify
5# it under the terms of the GNU General Public License as published by
6# the Free Software Foundation, either version 3 of the License, or
7# (at your option) any later version.
8
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU General Public License for more details.
13
14# You should have received a copy of the GNU General Public License
15# along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18""" Managment of the input files """
19
20#############################################################
21# I M P O R T I N G M O D U L E S
22#############################################################
23
24# our files are just pickled dicts
25from cPickle import load, dump
26
27import os
28from copy import copy
29from time import time
30import re
31import random
32
33
34#############################################################
35# G L O B A L C O N S T A N T S
36#############################################################
37
38# By design, to easy manipulation by external scripts, the input files
39# are just pickled dicts. We will do out best to keep the format as
40# stable as possible so you can free play with its content directly
41# without using an opaque interface. Internally, we reffer to the
42# content of loaded file as a "mapping".
43
44# The mapping features a multiple alignment and "annotations" on this
45# alignment that the user can add. Those annotations are (start, stop)
46# tuples describing regions. Region can be selected probes,
47# break-points, ... (more on this later)
48
49# The description in this file refers to a single version of the file
50# format. If you need to use a legacy file, look at the content of
51# this file in a passed release of this software.
52FILE_FORMAT_VERSION = 6
53VERSION_KEY = "format-version"
54
55# The official extention stands for Probe SELector
56FILE_FORMAT_EXT = ".psel"
57
58# The minimal length of the probes's names
59NAMES_MIN_LENGTH = 3
60
61#############################################################
62# K E Y S
63#############################################################
64
65# The region types, regions are (start, stop) tuples with the Python
66# slice semantic.
67PROBES = "probes" # selected probes
68BREAKS = "break-points" # low conservation (not really used)
69TRASHS = "trash-regions" # good conservation but poor score as a probe
70REGION_TYPES = [PROBES, BREAKS, TRASHS]
71
72# The mapping contains a multiple alignment, we need the alignment
73# sequence for each species and the species name. Those will point to
74# list of strings
75ALIGN = "aligned-sequences"
76SPECIES = "species-names"
77PROBES_IDS = "probes-ids"
78PROBES_IDS_LENGTH = "probes-ids-length"
79
80# To compute the probe scores we need the ref genomes, either refseq
81# number or the genome files. Those keys are optional and will point to
82# dict with integers as key. We use the index of the species in the
83# multiples alignment to map to its genome.
84REFSEQ_IDS = "ref-seq-ids"
85GENES_ANNOTATIONS = "genes-annotations"
86
87# TODO: those are not implemented yet
88#GENBANKS = "genbank-files"
89#FASTAS = "fasta-files"
90
91
92#############################################################
93# E X C E P T I O N S
94#############################################################
95
96class FileFormatError(Exception):
97 pass
98
99class MappingFormatError(Exception):
100 pass
101
102class InvalidFormatError(Exception):
103 pass
104
105
106#############################################################
107# I / O
108#############################################################
109
110def load_mapping(file):
111 """ Return the hashMap containing the alignment data. Take a filename
112 or an opened file as parameter.
113 """
114 f = file
115 try:
116 f.read
117 except AttributeError:
118 f = open(file)
119
120 try:
121 mapping = load(f)
122 except:
123 raise FileFormatError("file isn't loadable by the pickel module")
124
125 if mapping[VERSION_KEY] < FILE_FORMAT_VERSION:
126 raise FileFormatError("file format version is deprecated")
127
128 if mapping[VERSION_KEY] != FILE_FORMAT_VERSION:
129 raise FileFormatError("file format version not supported")
130
131 return mapping
132
133
134def save_mapping(mapping, file):
135 """ Save the data contain in the mapping. Take a filename or an opened
136 file as parameter (mapping is a dict).
137 """
138 f = file
139 try:
140 f.write
141 except AttributeError:
142 f = open(file, "w")
143
144 # minimal validation
145 for k in [ALIGN, SPECIES, REFSEQ_IDS, PROBES_IDS, PROBES, BREAKS, TRASHS, PROBES_IDS_LENGTH]:
146 if not mapping.has_key(k):
147 raise MappingFormatError("mapping lacks the " + k +" key")
148
149 # we use proto 0 since Windows seems to have problem with the
150 # latest proto.
151 versionned_mapping = copy(mapping)
152 versionned_mapping[VERSION_KEY] = FILE_FORMAT_VERSION
153
154 # for various reasons its a good idea to sort the annotations
155 for k in REGION_TYPES:
156 versionned_mapping[k].sort()
157
158 dump(versionned_mapping, f, 0)
159
160
161def set_empty_mapping(align, species):
162 """ Return the default mapping containing the minimal data witch are the
163 species names, the multiple alignment and the probes' name minimal length.
164 Take the multiple alignment and the species list as parameters.
165 """
166 mapping = { VERSION_KEY:FILE_FORMAT_VERSION,
167 ALIGN:align,
168 SPECIES:species,
169 REFSEQ_IDS:{},
170 PROBES_IDS:{},
171 PROBES:[],
172 BREAKS:[],
173 TRASHS:[],
174 PROBES_IDS_LENGTH:NAMES_MIN_LENGTH,
175 GENES_ANNOTATIONS:[] }
176 return mapping
177
178
179def import_multipipmaker_file(file):
180 """ Import a multipipmaker multiple alignement file. The content of
181 the file is read and then converted into a valid mapping.
182 """
183 data = open(file).read()
184 mapping = mapping_from_multipipmaker(data)
185 return mapping
186
187
188def import_clustalw_file(file):
189 """ Import a clustalw multiple alignement file. The content of
190 the file is read and then converted into a valid mapping
191 """
192 data = open(file).read()
193 mapping = mapping_from_clustalw(data)
194 return mapping
195
196
197def mapping_from_multipipmaker(data):
198 """ Convert a multipipmaker multiple alignement into a valid
199 mapping. Data is the content of the ASCII produced by multipipmaker.
200 At this stage we can't parse the pdf or postscript file.
201 """
202 # FIXME: will this work on mac ?
203 lines = filter(lambda l:l, map(lambda l:l.strip(), data.split("\n")))
204 nb_seq, length = map(int, lines[0].split(" "))
205
206 # from nb_seq to end is the align
207 align = lines[nb_seq+1:]
208 species = lines[1:nb_seq+1]
209
210 # Validation of the Pipmaker Format
211 if nb_seq != len(align) or len(species) != len(align):
212 raise InvalidFormatError("The file is not a valid Pipmaker format")
213
214 for line in align:
215 if len(line) != length:
216 raise InvalidFormatError("The file is not a valid Pipmaker format")
217
218 return set_empty_mapping(align, species)
219
220
221def mapping_from_clustalw(data):
222 """ Convert a clustalW multiple alignement into a valid mapping.
223 Data is the content of te ASCII produced by clustalW (.aln file).
224 """
225 lines = [l.replace("\r", "") for l in data.split("\n")]
226 if not re.match(r"CLUSTAL (W|X) \(\d\.\d+\)", lines[0]):
227 raise InvalidFormatError("The file is not a valid ClustalW format")
228
229 # TODO: this parsing will not work on macs
230 blocks = []
231 cur_block = []
232 for l in lines[3:]:
233 if l == "":
234 blocks.append(cur_block)
235 cur_block = []
236 else:
237 cur_block.append(l)
238
239 species = []
240 align = []
241 for line in blocks[0]:
242 if line[0] != " ":
243 species.append(line.split()[0])
244 align.append(line.split()[1])
245
246 nb_seq = len(species)
247
248 for i in range(nb_seq):
249 j = 1
250 while j < len(blocks):
251 align[i] = align[i] + blocks[j][i].split()[1]
252 j = j+1
253 seq_length = len(align[0])
254
255 # Validation of the clustal format
256 if nb_seq != len(species):
257 raise InvalidFormatError("The file is not a valid ClustalW format")
258
259 for seq in align:
260 if len(seq) != seq_length:
261 raise InvalidFormatError("The file is not a valid Clustal format")
262
263 return set_empty_mapping(align, species)
264
265
266#############################################################
267# U T I L S
268#############################################################
269
270def conservation(mapping, pos):
271 """ Return a value in [0..1] for the conservation. pos is a 0
272 based position on the multiple alignment. No bound checking is
273 done. Gaps are never considered conserved and we take the
274 nucleotide that appears the most, not necessarily the one on the
275 ref genome.
276 """
277 occ = {}
278 vals = mapping[ALIGN]
279 for s in vals:
280 occ[s[pos]] = occ.get(s[pos], 0) + 1
281 return 1.0*max([occ.get(n, 0) for n in ["A", "C", "G", "T"]])/len(vals)
282
283
284def ref_genome_seq(mapping, start, stop):
285 """ Return the ref genome sequence between start and stop.
286 Position are on the multiple alignment and gaps are trimed.
287 """
288 return mapping[ALIGN][0][start:stop].replace("-", "")
289
290
291def generate_probe_name(wordLength):
292 """ Return a random name compose of a altenate vowel an consomne. Take
293 the name length as parameter.
294 """
295 VOWEL = "aeiouy"
296 CONSOMNE = "qwrtpsdfghjklzxcvbnm"
297
298 random.seed(time())
299 name = "P"
300 for i in range(wordLength):
301 if i%2 == 0:
302 name = name + random.choice(VOWEL)
303 else:
304 name = name + random.choice(CONSOMNE)
305
306 return name
307
308
309def next_probeName(mapping, start, stop):
310 """ Generate name for a probe sequence and add it to the mapping. Return
311 the modified mapping. Take the unmodified mapping and the starting and
312 ending position of the probe sequence as parameters.
313 """
314 # Define the max limit before updating the name length
315 MAX_LIMIT = 5
316
317 wordLength = mapping[PROBES_IDS_LENGTH]
318 limit = 0
319 probeNames = []
320 keys = []
321 currentKey = (start, stop)
322
323 # Generate a probe name
324 newName = generate_probe_name(wordLength)
325
326 for key in mapping[PROBES_IDS]:
327 probeNames.append(mapping[PROBES_IDS][key])
328 keys.append(key)
329
330 # Return the unmodified mapping if the probe name is already define
331 if currentKey in keys:
332 return mapping
333
334 # Add the probe name to the mapping if it don't exist
335 else:
336 while newName in probeNames and key != currentKey:
337 newName = generate_probe_name(wordLength)
338 limit = limit+1
339 if limit == MAX_LIMIT:
340 wordLength = wordLength+1
341 mapping[PROBES_IDS_LENGTH] = wordLength
342 mapping[PROBES_IDS][currentKey] = newName
343 return mapping
344
345
346# simple self test
347if __name__ == "__main__":
348 import sys
349 load_mapping(sys.argv[1])
350
351