gff_filter_by_feature_count.py - This Python script filters…

/tools/filters/gff/gff_filter_by_feature_count.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 88 lines · 61 code · 10 blank · 17 comment · 14 complexity · 200da793eaa81023b16e6f824fc6709a MD5 · raw file


#!/usr/bin/env python
"""
Filter a gff file using a criterion based on feature counts for a transcript.

Usage:
%prog input_name output_name feature_name condition
"""
import sys
from galaxy import eggs
from galaxy.datatypes.util.gff_util import GFFReaderWrapper
from bx.intervals.io import GenomicInterval

# Valid operators, ordered so that complex operators (e.g. '>=') are
# recognized before simple operators (e.g. '>')
ops = [
    '>=',
    '<=',
    '<',
    '>',
    '==',
    '!='
]

# Escape sequences for valid operators.
mapped_ops = {
    '__ge__': ops[0],
    '__le__': ops[1],
    '__lt__': ops[2],
    '__gt__': ops[3],
    '__eq__': ops[4],
    '__ne__': ops[5],
}


def __main__():
    # Get args.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    feature_name = sys.argv[3]
    condition = sys.argv[4]
    
    # Unescape operations in condition str.
    for key, value in mapped_ops.items():
        condition = condition.replace( key, value )
    
    # Error checking: condition should be of the form <operator><number>
    for op in ops:
        if op in condition:
            empty, number_str = condition.split( op )
            try:
                number = float( number_str )
            except:
                number = None
            if empty != "" or not number:
                print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
                return
            break

    # Do filtering.
    kept_features = 0
    skipped_lines = 0
    first_skipped_line = 0
    out = open( output_name, 'w' )
    for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ):
        if not isinstance( feature, GenomicInterval ):
            continue
        count = 0
        for interval in feature.intervals:
            if interval.feature == feature_name:
                count += 1
        if eval( '%s %s' % ( count, condition ) ):
            # Keep feature.
            for interval in feature.intervals:
                out.write( "\t".join(interval.fields) + '\n' )
            kept_features += 1

    # Needed because i is 0-based but want to display stats using 1-based.
    i += 1

    # Clean up.
    out.close()
    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
        ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
    print info_msg

if __name__ == "__main__": __main__()

Summary ✨

This Python script filters a GFF file based on a criterion for feature counts within transcripts. It takes four arguments: the input GFF file, the output file name, the feature name to filter by, and the condition for filtering (e.g., “>5”). The script uses eval() to evaluate the condition and checks if the number of features with the specified name in each transcript satisfies the condition. If it does, the feature is kept; otherwise, it is skipped. The script also reports the number of kept features, skipped lines, and the filtering condition used.

Tech Fingerprint

Alerts (5)

'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
52
'open(' Use 'with open()' to ensure Files are properly closed
63 64
'isinstance(' Overuse may indicate design issues; consider polymorphism
65
'eval(' Avoid due to security risks; use ast.literal_eval for safer evaluation of literals
71