PageRenderTime 25ms CodeModel.GetById 14ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/filters/gff/gtf_filter_by_attribute_values_list.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 67 lines | 54 code | 2 blank | 11 comment | 0 complexity | 9f07fa61e1d688b6f7f093fc7609c20c MD5 | raw file
 1#
 2# Filters a GFF/GTF file using a list of attribute values. The values must
 3# be in the first tab-separated column of the ids file; subsequent columns are ignored.
 4# Usage:
 5# python gtf_filter_by_attribute_values_list.py <gff_file> <attribute_name> <ids_file> <output_file>
 6#
 7
 8import sys
 9
def parse_gff_attributes(attr_str):
    """
    Parse a GFF/GTF attribute string into a dictionary of name-value pairs.

    Three layouts are recognised:

    * GFF3: ``name1=value1;name2=value2``
    * GTF:  ``name1 "value1"; name2 "value2"``
    * plain GFF: a single string naming the interval's group; in this case
      the result is ``{'group': attr_str}`` (the string is stored unchanged).

    Each ``;``-separated field is split only once, so values that contain
    spaces (``Name=my gene``, ``gene_name "ABC DEF"``) or ``=`` signs are
    kept whole. Surrounding spaces and double quotes are stripped from
    values. Fields that cannot be split, or whose name is empty, are skipped.
    """
    attributes = {}
    for raw_field in attr_str.split(";"):
        field = raw_field.strip()
        if not field:
            # Empty segment, e.g. after a trailing ';'.
            continue

        # The first whitespace-delimited token tells the two formats apart:
        # in GFF3 it already contains '=', in GTF it is the bare name.
        pieces = field.split(None, 1)
        if "=" in pieces[0]:
            name, _, value = field.partition("=")
        elif len(pieces) == 2:
            name, value = pieces
        else:
            # Single token without '=': not a name/value pair.
            continue

        name = name.strip()
        if not name:
            continue
        # GTF values are wrapped in double quotes; drop them.
        attributes[name] = value.strip(' "')

    if not attributes:
        # Could not split the attributes string, so the entire string must be
        # the 'group' attribute. This is the case for strictly GFF files.
        attributes['group'] = attr_str
    return attributes
45
def filter(gff_file, attribute_name, ids_file, output_file):
    """
    Copy to output_file the lines of gff_file whose attribute_name value
    appears in ids_file.

    gff_file       -- path of the tab-separated GFF/GTF input; the ninth
                      column is parsed with parse_gff_attributes.
    attribute_name -- name of the attribute to test (e.g. 'gene_id').
    ids_file       -- path of a tab-separated file; only the first column of
                      each line is used, blank entries are ignored.
    output_file    -- path to write matching lines to (overwritten).

    Lines with fewer than nine columns (comments, blank lines, malformed
    records) are skipped instead of raising IndexError.
    """
    # Collect the accepted values in a set for quick lookup.
    wanted_ids = set()
    with open(ids_file) as ids_handle:
        for id_line in ids_handle:
            wanted = id_line.split('\t')[0].strip()
            if wanted:
                wanted_ids.add(wanted)

    # Filter the GFF file using the ids; 'with' guarantees both handles are
    # closed even if parsing raises.
    with open(output_file, 'w') as output:
        with open(gff_file) as gff_handle:
            for line in gff_handle:
                # Drop the line terminator so it does not end up inside the
                # attributes column (the last one on the line).
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 9:
                    continue
                attributes = parse_gff_attributes(fields[8])
                if attribute_name in attributes and attributes[attribute_name] in wanted_ids:
                    output.write(line)
60        
if __name__ == "__main__":
    # Handle args: exactly four are required after the script name.
    if len(sys.argv) != 5:
        # sys.stderr.write works on both Python 2 and 3, unlike the
        # Python 2-only 'print >>' statement.
        sys.stderr.write(
            "usage: python %s <gff_file> <attribute_name> <ids_file> <output_file>\n"
            % sys.argv[0]
        )
        sys.exit(-1)
    gff_file, attribute_name, ids_file, output_file = sys.argv[1:]
    filter(gff_file, attribute_name, ids_file, output_file)