PageRenderTime 33ms CodeModel.GetById 14ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/filters/gff/gff_filter_by_feature_count.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 88 lines | 68 code | 7 blank | 13 comment | 8 complexity | 200da793eaa81023b16e6f824fc6709a MD5 | raw file
 1#!/usr/bin/env python
 2"""
 3Filter a gff file using a criterion based on feature counts for a transcript.
 4
 5Usage:
 6%prog input_name output_name feature_name condition
 7"""
 8import sys
 9from galaxy import eggs
10from galaxy.datatypes.util.gff_util import GFFReaderWrapper
11from bx.intervals.io import GenomicInterval
12
13# Valid operators, ordered so that complex operators (e.g. '>=') are
14# recognized before simple operators (e.g. '>')
15ops = [
16    '>=',
17    '<=',
18    '<',
19    '>',
20    '==',
21    '!='
22]
23
24# Escape sequences for valid operators.
25mapped_ops = {
26    '__ge__': ops[0],
27    '__le__': ops[1],
28    '__lt__': ops[2],
29    '__gt__': ops[3],
30    '__eq__': ops[4],
31    '__ne__': ops[5],
32}
33
34
35def __main__():
36    # Get args.
37    input_name = sys.argv[1]
38    output_name = sys.argv[2]
39    feature_name = sys.argv[3]
40    condition = sys.argv[4]
41    
42    # Unescape operations in condition str.
43    for key, value in mapped_ops.items():
44        condition = condition.replace( key, value )
45    
46    # Error checking: condition should be of the form <operator><number>
47    for op in ops:
48        if op in condition:
49            empty, number_str = condition.split( op )
50            try:
51                number = float( number_str )
52            except:
53                number = None
54            if empty != "" or not number:
55                print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
56                return
57            break
58
59    # Do filtering.
60    kept_features = 0
61    skipped_lines = 0
62    first_skipped_line = 0
63    out = open( output_name, 'w' )
64    for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ):
65        if not isinstance( feature, GenomicInterval ):
66            continue
67        count = 0
68        for interval in feature.intervals:
69            if interval.feature == feature_name:
70                count += 1
71        if eval( '%s %s' % ( count, condition ) ):
72            # Keep feature.
73            for interval in feature.intervals:
74                out.write( "\t".join(interval.fields) + '\n' )
75            kept_features += 1
76
77    # Needed because i is 0-based but want to display stats using 1-based.
78    i += 1
79
80    # Clean up.
81    out.close()
82    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
83        ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
84    if skipped_lines > 0:
85        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
86    print info_msg
87
88if __name__ == "__main__": __main__()