trim-leaders-fastq.py - Copyright (c) 2020 Hyeshik Chang Pe…

/nanopore/_scripts/trim-leaders-fastq.py

https://github.com/hyeshik/sars-cov-2-transcriptome · Python · 100 lines · 58 code · 19 blank · 23 comment · 17 complexity · 45ea7b0bcc611bde1618e8f025a4498e MD5 · raw file


#!/usr/bin/env python3
#
# Copyright (c) 2020 Hyeshik Chang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

import pysam
from itertools import groupby
import re
import sys
import pandas as pd
import glob
from Bio import SeqIO
import gzip

LEADER_TRIMMING_UPPER_LIMIT = 100

cigar_pat = re.compile('\\d+[A-Z]')
input_bam = sys.argv[1]
input_fn = sys.argv[2]
output_fn = sys.argv[3]

def get_sam_reads_by_group(filename):
    get_subject_name = lambda aln: aln.reference_name

    with pysam.AlignmentFile(filename) as alnfile:
        for qname, alns in groupby(alnfile, key=get_subject_name):
            yield qname, list(alns)

def get_alignment_lengths(cigar):
    tokens = cigar_pat.findall(cigar)

    left_clip = int(tokens[0][:-1]) if tokens[0][-1] == 'H' else 0
    right_clip = int(tokens[-1][:-1]) if tokens[-1][-1] == 'H' else 0

    refpos = readpos = 0

    for tok in tokens:
        op = tok[-1]
        length = int(tok[:-1])
        if op in 'MI':
            readpos += length
        if op in 'MDN':
            refpos += length
        if op == 'S':
            raise ValueError('S is not handled.')

    return readpos, refpos, left_clip, right_clip

leader_matches = []
for filename in glob.glob('alignments/VeroInf24h.viral-leader.blast/part-*.bam'):
    #print(filename)
    for qname, alns in get_sam_reads_by_group(filename):
        if len(alns) > 1:
            continue

        for aln in alns:
            readstart = aln.reference_start
            salnwidth, qalnwidth, lclip, rclip = get_alignment_lengths(aln.cigarstring)

            vg_range = lclip, lclip + salnwidth
            rd_range = readstart, readstart + qalnwidth

            leader_matches.append([aln.reference_name, rd_range[0], rd_range[1],
                                   vg_range[0], vg_range[1], lclip, rclip])

leader_matches = \
  pd.DataFrame(leader_matches, columns=['read_id', 'read_start', 'read_end',
                                        'query_start', 'query_end',
                                        'left_clip', 'right_clip'])

high_quality_matches = \
    leader_matches[(leader_matches['read_end'] < LEADER_TRIMMING_UPPER_LIMIT) &
                   (leader_matches['right_clip'] < 1)]

trimming_length = high_quality_matches.set_index('read_id')['read_end'].to_dict()

with gzip.open(output_fn, 'wt') as wrtfile:
    for seq in SeqIO.parse(gzip.open(input_fn, 'rt'), 'fastq'):
        trimlen = trimming_length.get(seq.name)
        if trimlen is not None:
            SeqIO.write([seq[trimlen:]], wrtfile, 'fastq')

Alerts (8)

'def' Ensure functions have docstrings for documentation
40 47
'list(' Avoid unnecessary list conversions; use generators where possible
45
Complexity hotspot; lines 50 to 51 (total complexity: 4)
50 51
'print(' Use logging module for better control and configurability
69
'gzip.open(' Potential decompression bomb vulnerability in Python code if input is untrusted; ensure to limit the number of bytes read.
95 96