axt_to_lav.py - This is a Python script that converts an AX…

/tools/filters/axt_to_lav.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 176 lines · 124 code · 37 blank · 15 comment · 28 complexity · 529af9fc08e343352fdd8bbfdc037321 MD5 · raw file


#!/usr/bin/env python
"""
Application to convert AXT file to LAV file
-------------------------------------------

:Author: Bob Harris (rsharris@bx.psu.edu)
:Version: $Revision: $

The application reads an AXT file from standard input and writes a LAV file to
standard out;  some statistics are written to standard error.
"""

import sys, copy
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import bx.align.axt
import bx.align.lav

assert sys.version_info[:2] >= ( 2, 4 )

def usage(s=None):
    message = """
axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
  Each spec is of the form seq_file[:species_name]:lengths_file.

  seq_file should be a format string for the file names for the individual
  sequences, with %s to be replaced by the alignment's src field.  For example,
  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
  etc.

  species_name is optional.  If present, it is prepended to the alignment's src
  field.

  Lengths files provide the length of each chromosome (lav format needs this
  information but axt file does not contain it).  The format is a series of
  lines of the form
    <chromosome name> <length>
  The chromosome field in each axt block must match some <chromosome name> in
  the lengths file.
"""
    if (s == None): sys.exit (message)
    else:           sys.exit ("%s\n%s" % (s,message))


def main():
    global debug

    # parse the command line

    primary   = None
    secondary = None
    silent    = False

    # pick off options

    args = sys.argv[1:]
    seq_file2 = open(args.pop(-1),'w')
    seq_file1 = open(args.pop(-1),'w')
    lav_out = args.pop(-1)
    axt_in = args.pop(-1)
    while (len(args) > 0):
        arg = args.pop(0)
        val = None
        fields = arg.split("=",1)
        if (len(fields) == 2):
            arg = fields[0]
            val = fields[1]
            if (val == ""):
                usage("missing a value in %s=" % arg)

        if (arg == "--silent") and (val == None):
            silent = True
        elif (primary == None) and (val == None):
            primary = arg
        elif (secondary == None) and (val == None):
            secondary = arg
        else:
            usage("unknown argument: %s" % arg)

    if (primary == None):
        usage("missing primary file name and length")

    if (secondary == None):
        usage("missing secondary file name and length")

    try:
        (primaryFile,primary,primaryLengths) = parse_spec(primary)
    except:
        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")

    try:
        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
    except:
        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")

    # read the lengths

    speciesToLengths = {}
    speciesToLengths[primary]   = read_lengths (primaryLengths)
    speciesToLengths[secondary] = read_lengths (secondaryLengths)

    # read the alignments

    out = bx.align.lav.Writer(open(lav_out,'w'), \
            attributes = { "name_format_1" : primaryFile,
                           "name_format_2" : secondaryFile })

    axtsRead = 0
    axtsWritten = 0
    for axtBlock in bx.align.axt.Reader(open(axt_in), \
            species_to_lengths = speciesToLengths,
            species1           = primary,
            species2           = secondary,
            support_ids        = True):
        axtsRead += 1
        out.write (axtBlock)
        primary_c = axtBlock.get_component_by_src_start(primary)
        secondary_c = axtBlock.get_component_by_src_start(secondary)
        
        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
        print >>seq_file1,primary_c.text
        print >>seq_file1
        
        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
        print >>seq_file2,secondary_c.text
        print >>seq_file2
        axtsWritten += 1

    out.close()
    seq_file1.close()
    seq_file2.close()

    if (not silent):
        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))

def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
    fields = spec.split(":")
    if   (len(fields) == 2): return (fields[0],"",fields[1])
    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
    else:                    raise ValueError

def read_lengths (fileName):

    chromToLength = {}

    f = file (fileName, "r")

    for lineNumber,line in enumerate(f):
        line = line.strip()
        if (line == ""): continue
        if (line.startswith("#")): continue

        fields = line.split ()
        if (len(fields) != 2):
            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)

        chrom = fields[0]
        try:
            length = int(fields[1])
        except:
            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)

        if (chrom in chromToLength):
            raise "%s appears more than once (%s:%d): %s" \
                % (chrom,fileName,lineNumber)

        chromToLength[chrom] = length

    f.close ()

    return chromToLength


if __name__ == "__main__": main()

Summary ✨

This is a Python script that converts an AXT file to a LAV file, which is a format used for alignments in the LAST software package. The script takes two arguments: the name of the AXT file and the name of the output LAV file. It also requires two specs for the primary and secondary sequences, which specify the file names and lengths of the sequences. The script reads the AXT file and writes a LAV file, while also writing the aligned sequences to separate files.

Tech Fingerprint

Alerts (20)

'== None' Use 'is' for None comparisons (e.g., x is None)
42 72 74 76 81 84
'def' Ensure functions have docstrings for documentation
46 137 143
'global' Avoid global variables; use function parameters or class attributes for better scope management
47
'open(' Use 'with open()' to ensure Files are properly closed
58 59 105 111
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
89 94 161
Complexity hotspot; lines 139 to 141 (total complexity: 3)
139 140 141