PageRenderTime 44ms CodeModel.GetById 22ms app.highlight 19ms RepoModel.GetById 0ms app.codeStats 0ms

/pipviewer-0.3.9/pipviewer/pselfiles.py

#
Python | 351 lines | 230 code | 40 blank | 81 comment | 11 complexity | 05edeedb703fdca6d8d8cd15c0d67669 MD5 | raw file
Possible License(s): GPL-3.0
  1#!/usr/bin/python
  2#  Copyright (C) 2006-2007 Free Software Foundation
  3
  4#  This program is free software: you can redistribute it and/or modify
  5#  it under the terms of the GNU General Public License as published by
  6#  the Free Software Foundation, either version 3 of the License, or
  7#  (at your option) any later version.
  8
  9#  This program is distributed in the hope that it will be useful,
 10#  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12#  GNU General Public License for more details.
 13
 14#  You should have received a copy of the GNU General Public License
 15#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16
 17
 18""" Managment of the input files """
 19
 20#############################################################
 21#            I M P O R T I N G   M O D U L E S
 22#############################################################
 23
 24# our files are just pickled dicts
 25from cPickle import load, dump
 26
 27import os
 28from copy import copy
 29from time import time
 30import re
 31import random
 32
 33
 34#############################################################
 35#              G L O B A L   C O N S T A N T S
 36#############################################################
 37
 38# By design, to easy manipulation by external scripts, the input files
 39# are just pickled dicts.  We will do out best to keep the format as
 40# stable as possible so you can free play with its content directly
 41# without using an opaque interface.  Internally, we reffer to the
 42# content of loaded file as a "mapping".
 43
 44# The mapping features a multiple alignment and "annotations" on this
 45# alignment that the user can add.  Those annotations are (start, stop)
 46# tuples describing regions.  Region can be selected probes,
 47# break-points, ... (more on this later)
 48
 49# The description in this file refers to a single version of the file
 50# format.  If you need to use a legacy file, look at the content of
 51# this file in a passed release of this software.
 52FILE_FORMAT_VERSION = 6
 53VERSION_KEY = "format-version"
 54
 55# The official extention stands for Probe SELector
 56FILE_FORMAT_EXT = ".psel"
 57
 58# The minimal length of the probes's names
 59NAMES_MIN_LENGTH = 3
 60
 61#############################################################
 62#                     K E Y S
 63#############################################################
 64
 65# The region types, regions are (start, stop) tuples with the Python
 66# slice semantic.
 67PROBES = "probes" # selected probes
 68BREAKS = "break-points" # low conservation (not really used)
 69TRASHS = "trash-regions" # good conservation but poor score as a probe
 70REGION_TYPES = [PROBES, BREAKS, TRASHS]
 71
 72# The mapping contains a multiple alignment, we need the alignment
 73# sequence for each species and the species name.  Those will point to
 74# list of strings
 75ALIGN = "aligned-sequences"
 76SPECIES = "species-names"
 77PROBES_IDS = "probes-ids"
 78PROBES_IDS_LENGTH = "probes-ids-length"
 79
 80# To compute the probe scores we need the ref genomes, either refseq
 81# number or the genome files.  Those keys are optional and will point to
 82# dict with integers as key.  We use the index of the species in the
 83# multiples alignment to map to its genome.
 84REFSEQ_IDS = "ref-seq-ids"
 85GENES_ANNOTATIONS = "genes-annotations"
 86
 87# TODO: those are not implemented yet
 88#GENBANKS   = "genbank-files"
 89#FASTAS     = "fasta-files"
 90
 91
 92#############################################################
 93#                  E X C E P T I O N S
 94#############################################################
 95
 96class FileFormatError(Exception):
 97    pass
 98
 99class MappingFormatError(Exception):
100    pass
101    
102class InvalidFormatError(Exception):
103    pass
104
105
106#############################################################
107#                          I / O
108#############################################################
109
110def load_mapping(file):
111    """ Return the hashMap containing the alignment data. Take a filename 
112    or an opened file as parameter. 
113    """
114    f = file
115    try:
116        f.read
117    except AttributeError:
118        f = open(file)
119    
120    try:
121        mapping = load(f)
122    except:
123        raise FileFormatError("file isn't loadable by the pickel module")
124
125    if mapping[VERSION_KEY] < FILE_FORMAT_VERSION:
126        raise FileFormatError("file format version is deprecated")
127    
128    if mapping[VERSION_KEY] != FILE_FORMAT_VERSION:
129        raise FileFormatError("file format version not supported")
130
131    return mapping
132    
133
134def save_mapping(mapping, file):
135    """ Save the data contain in the mapping. Take a filename or an opened
136    file as parameter (mapping is a dict).
137    """
138    f = file
139    try:
140        f.write
141    except AttributeError:
142        f = open(file, "w")
143
144    # minimal validation
145    for k in [ALIGN, SPECIES, REFSEQ_IDS, PROBES_IDS, PROBES, BREAKS, TRASHS, PROBES_IDS_LENGTH]:
146        if not mapping.has_key(k):
147            raise MappingFormatError("mapping lacks the " + k +" key")
148
149    # we use proto 0 since Windows seems to have problem with the
150    # latest proto.
151    versionned_mapping = copy(mapping)
152    versionned_mapping[VERSION_KEY] = FILE_FORMAT_VERSION
153
154    # for various reasons its a good idea to sort the annotations
155    for k in REGION_TYPES:
156        versionned_mapping[k].sort()
157
158    dump(versionned_mapping, f, 0)
159    
160
161def set_empty_mapping(align, species):
162    """ Return the default mapping containing the minimal data witch are the
163    species names, the multiple alignment and the probes' name minimal length.
164    Take the multiple alignment and the species list as parameters. 
165    """
166    mapping = { VERSION_KEY:FILE_FORMAT_VERSION,
167                ALIGN:align,
168                SPECIES:species,
169                REFSEQ_IDS:{},
170                PROBES_IDS:{},
171                PROBES:[],
172                BREAKS:[],
173                TRASHS:[],
174                PROBES_IDS_LENGTH:NAMES_MIN_LENGTH,
175                GENES_ANNOTATIONS:[] }
176    return mapping
177
178
179def import_multipipmaker_file(file):
180    """ Import a multipipmaker multiple alignement file. The content of 
181    the file is read and then converted into a valid mapping.
182    """  
183    data = open(file).read()
184    mapping = mapping_from_multipipmaker(data)
185    return mapping
186
187
188def import_clustalw_file(file):
189    """ Import a clustalw multiple alignement file. The content of 
190    the file is read and then converted into a valid mapping
191    """  
192    data = open(file).read()
193    mapping = mapping_from_clustalw(data)
194    return mapping
195
196
197def mapping_from_multipipmaker(data):
198    """ Convert a multipipmaker multiple alignement into a valid
199    mapping. Data is the content of the ASCII produced by multipipmaker.
200    At this stage we can't parse the pdf or postscript file.
201    """
202    # FIXME: will this work on mac ?
203    lines = filter(lambda l:l, map(lambda l:l.strip(), data.split("\n")))
204    nb_seq, length = map(int, lines[0].split(" "))
205
206    # from nb_seq to end is the align
207    align = lines[nb_seq+1:]
208    species = lines[1:nb_seq+1]
209    
210    # Validation of the Pipmaker Format
211    if nb_seq != len(align) or len(species) != len(align):
212        raise InvalidFormatError("The file is not a valid Pipmaker format")
213    
214    for line in align:
215        if len(line) != length:
216            raise InvalidFormatError("The file is not a valid Pipmaker format")
217    
218    return set_empty_mapping(align, species)
219
220
221def mapping_from_clustalw(data):
222    """ Convert a clustalW multiple alignement into a valid mapping.
223    Data is the content of te ASCII produced by clustalW (.aln file).
224    """
225    lines = [l.replace("\r", "") for l in data.split("\n")]
226    if not re.match(r"CLUSTAL (W|X) \(\d\.\d+\)", lines[0]):
227        raise InvalidFormatError("The file is not a valid ClustalW format")
228    
229    # TODO: this parsing will not work on macs
230    blocks = []
231    cur_block = []
232    for l in lines[3:]:
233        if l == "":
234            blocks.append(cur_block)
235            cur_block = []
236        else:
237            cur_block.append(l)
238    
239    species = []
240    align = []
241    for line in blocks[0]:
242        if line[0] != " ":
243            species.append(line.split()[0])
244            align.append(line.split()[1])
245    
246    nb_seq = len(species)
247    
248    for i in range(nb_seq):
249        j = 1
250        while j < len(blocks):
251            align[i] = align[i] + blocks[j][i].split()[1]
252            j = j+1
253    seq_length = len(align[0])
254    
255    # Validation of the clustal format
256    if nb_seq != len(species):
257        raise InvalidFormatError("The file is not a valid ClustalW format")
258    
259    for seq in align:
260        if len(seq) != seq_length:
261            raise InvalidFormatError("The file is not a valid  Clustal format")
262    
263    return set_empty_mapping(align, species)
264
265 
266#############################################################
267#                        U T I L S
268#############################################################
269
270def conservation(mapping, pos):
271    """ Return a value in [0..1] for the conservation.  pos is a 0
272    based position on the multiple alignment.  No bound checking is
273    done. Gaps are never considered conserved and we take the
274    nucleotide that appears the most, not necessarily the one on the
275    ref genome.
276    """
277    occ = {}
278    vals = mapping[ALIGN]
279    for s in vals:
280        occ[s[pos]] = occ.get(s[pos], 0) + 1
281    return 1.0*max([occ.get(n, 0) for n in ["A", "C", "G", "T"]])/len(vals)
282
283
284def ref_genome_seq(mapping, start, stop):
285    """ Return the ref genome sequence between start and stop.
286    Position are on the multiple alignment and gaps are trimed.
287    """
288    return mapping[ALIGN][0][start:stop].replace("-", "")
289
290      
291def generate_probe_name(wordLength):
292    """ Return a random name compose of a altenate vowel an consomne. Take 
293    the name length as parameter.
294    """
295    VOWEL = "aeiouy"
296    CONSOMNE = "qwrtpsdfghjklzxcvbnm"
297    
298    random.seed(time())
299    name = "P"
300    for i in range(wordLength):
301        if i%2 == 0:
302            name = name + random.choice(VOWEL)
303        else:
304            name = name + random.choice(CONSOMNE)
305    
306    return name
307    
308   
309def next_probeName(mapping, start, stop):
310    """ Generate name for a probe sequence and add it to the mapping. Return
311    the modified mapping. Take the unmodified mapping and the starting and
312    ending position of the probe sequence as parameters.
313    """
314    # Define the max limit before updating the name length
315    MAX_LIMIT = 5
316    
317    wordLength = mapping[PROBES_IDS_LENGTH]
318    limit = 0
319    probeNames = []
320    keys = []
321    currentKey = (start, stop)
322    
323    # Generate a probe name
324    newName = generate_probe_name(wordLength)
325    
326    for key in mapping[PROBES_IDS]:
327        probeNames.append(mapping[PROBES_IDS][key])
328        keys.append(key)
329    
330    # Return the unmodified mapping if the probe name is already define
331    if currentKey in keys:
332        return mapping
333    
334    # Add the probe name to the mapping if it don't exist
335    else:
336        while newName in probeNames and key != currentKey:
337            newName = generate_probe_name(wordLength)
338            limit = limit+1
339            if limit == MAX_LIMIT:
340                wordLength = wordLength+1
341        mapping[PROBES_IDS_LENGTH] = wordLength
342        mapping[PROBES_IDS][currentKey] = newName
343        return mapping
344    
345
346# simple self test
347if __name__ == "__main__":
348    import sys
349    load_mapping(sys.argv[1])
350
351