PageRenderTime 27ms CodeModel.GetById 9ms app.highlight 14ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/filters/axt_to_lav.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 176 lines | 153 code | 12 blank | 11 comment | 7 complexity | 529af9fc08e343352fdd8bbfdc037321 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3Application to convert AXT file to LAV file
  4-------------------------------------------
  5
  6:Author: Bob Harris (rsharris@bx.psu.edu)
  7:Version: $Revision: $
  8
  9The application reads an AXT file from standard input and writes a LAV file to
 10standard out;  some statistics are written to standard error.
 11"""
 12
 13import sys, copy
 14from galaxy import eggs
 15import pkg_resources
 16pkg_resources.require( "bx-python" )
 17import bx.align.axt
 18import bx.align.lav
 19
 20assert sys.version_info[:2] >= ( 2, 4 )
 21
 22def usage(s=None):
 23    message = """
 24axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
 25  Each spec is of the form seq_file[:species_name]:lengths_file.
 26
 27  seq_file should be a format string for the file names for the individual
 28  sequences, with %s to be replaced by the alignment's src field.  For example,
 29  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
 30  etc.
 31
 32  species_name is optional.  If present, it is prepended to the alignment's src
 33  field.
 34
 35  Lengths files provide the length of each chromosome (lav format needs this
 36  information but axt file does not contain it).  The format is a series of
 37  lines of the form
 38    <chromosome name> <length>
 39  The chromosome field in each axt block must match some <chromosome name> in
 40  the lengths file.
 41"""
 42    if (s == None): sys.exit (message)
 43    else:           sys.exit ("%s\n%s" % (s,message))
 44
 45
 46def main():
 47    global debug
 48
 49    # parse the command line
 50
 51    primary   = None
 52    secondary = None
 53    silent    = False
 54
 55    # pick off options
 56
 57    args = sys.argv[1:]
 58    seq_file2 = open(args.pop(-1),'w')
 59    seq_file1 = open(args.pop(-1),'w')
 60    lav_out = args.pop(-1)
 61    axt_in = args.pop(-1)
 62    while (len(args) > 0):
 63        arg = args.pop(0)
 64        val = None
 65        fields = arg.split("=",1)
 66        if (len(fields) == 2):
 67            arg = fields[0]
 68            val = fields[1]
 69            if (val == ""):
 70                usage("missing a value in %s=" % arg)
 71
 72        if (arg == "--silent") and (val == None):
 73            silent = True
 74        elif (primary == None) and (val == None):
 75            primary = arg
 76        elif (secondary == None) and (val == None):
 77            secondary = arg
 78        else:
 79            usage("unknown argument: %s" % arg)
 80
 81    if (primary == None):
 82        usage("missing primary file name and length")
 83
 84    if (secondary == None):
 85        usage("missing secondary file name and length")
 86
 87    try:
 88        (primaryFile,primary,primaryLengths) = parse_spec(primary)
 89    except:
 90        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
 91
 92    try:
 93        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
 94    except:
 95        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
 96
 97    # read the lengths
 98
 99    speciesToLengths = {}
100    speciesToLengths[primary]   = read_lengths (primaryLengths)
101    speciesToLengths[secondary] = read_lengths (secondaryLengths)
102
103    # read the alignments
104
105    out = bx.align.lav.Writer(open(lav_out,'w'), \
106            attributes = { "name_format_1" : primaryFile,
107                           "name_format_2" : secondaryFile })
108
109    axtsRead = 0
110    axtsWritten = 0
111    for axtBlock in bx.align.axt.Reader(open(axt_in), \
112            species_to_lengths = speciesToLengths,
113            species1           = primary,
114            species2           = secondary,
115            support_ids        = True):
116        axtsRead += 1
117        out.write (axtBlock)
118        primary_c = axtBlock.get_component_by_src_start(primary)
119        secondary_c = axtBlock.get_component_by_src_start(secondary)
120        
121        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
122        print >>seq_file1,primary_c.text
123        print >>seq_file1
124        
125        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
126        print >>seq_file2,secondary_c.text
127        print >>seq_file2
128        axtsWritten += 1
129
130    out.close()
131    seq_file1.close()
132    seq_file2.close()
133
134    if (not silent):
135        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
136
137def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
138    fields = spec.split(":")
139    if   (len(fields) == 2): return (fields[0],"",fields[1])
140    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
141    else:                    raise ValueError
142
143def read_lengths (fileName):
144
145    chromToLength = {}
146
147    f = file (fileName, "r")
148
149    for lineNumber,line in enumerate(f):
150        line = line.strip()
151        if (line == ""): continue
152        if (line.startswith("#")): continue
153
154        fields = line.split ()
155        if (len(fields) != 2):
156            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
157
158        chrom = fields[0]
159        try:
160            length = int(fields[1])
161        except:
162            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
163
164        if (chrom in chromToLength):
165            raise "%s appears more than once (%s:%d): %s" \
166                % (chrom,fileName,lineNumber)
167
168        chromToLength[chrom] = length
169
170    f.close ()
171
172    return chromToLength
173
174
175if __name__ == "__main__": main()
176