/tools/filters/gff/gtf_filter_by_attribute_values_list.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 67 lines · 42 code · 6 blank · 19 comment · 12 complexity · 9f07fa61e1d688b6f7f093fc7609c20c MD5 · raw file

  1. #
  2. # Filters a GFF/GTF file using a list of attribute values. The values are read
  3. # from the first tab-separated column of <ids_file>; subsequent columns are ignored.
  4. # Usage:
  5. # python gtf_filter_by_attribute_values_list.py <gff_file> <attribute_name> <ids_file> <output_file>
  6. #
  7. import sys
  8. def parse_gff_attributes( attr_str ):
  9. """
  10. Parses a GFF/GTF attribute string and returns a dictionary of name-value
  11. pairs. The general format for a GFF3 attributes string is
  12. name1=value1;name2=value2
  13. The general format for a GTF attribute string is
  14. name1 "value1" ; name2 "value2"
  15. The general format for a GFF attribute string is a single string that
  16. denotes the interval's group; in this case, method returns a dictionary
  17. with a single key-value pair, and key name is 'group'
  18. """
  19. attributes_list = attr_str.split(";")
  20. attributes = {}
  21. for name_value_pair in attributes_list:
  22. # Try splitting by space and, if necessary, by '=' sign.
  23. pair = name_value_pair.strip().split(" ")
  24. if len( pair ) == 1:
  25. pair = name_value_pair.strip().split("=")
  26. if len( pair ) == 1:
  27. # Could not split for some reason -- raise exception?
  28. continue
  29. if pair == '':
  30. continue
  31. name = pair[0].strip()
  32. if name == '':
  33. continue
  34. # Need to strip double quote from values
  35. value = pair[1].strip(" \"")
  36. attributes[ name ] = value
  37. if len( attributes ) == 0:
  38. # Could not split attributes string, so entire string must be
  39. # 'group' attribute. This is the case for strictly GFF files.
  40. attributes['group'] = attr_str
  41. return attributes
  42. def filter( gff_file, attribute_name, ids_file, output_file ):
  43. # Put ids in dict for quick lookup.
  44. ids_dict = {}
  45. for line in open( ids_file ):
  46. ids_dict[ line.split('\t')[0].strip() ] = True
  47. # Filter GFF file using ids.
  48. output = open( output_file, 'w' )
  49. for line in open( gff_file ):
  50. fields = line.split( '\t' )
  51. attributes = parse_gff_attributes( fields[8] )
  52. if ( attribute_name in attributes ) and ( attributes[ attribute_name ] in ids_dict ):
  53. output.write( line )
  54. output.close()
  55. if __name__ == "__main__":
  56. # Handle args.
  57. if len( sys.argv ) != 5:
  58. print >> sys.stderr, "usage: python %s <gff_file> <attribute_name> <ids_file> <output_file>" % sys.argv[0]
  59. sys.exit( -1 )
  60. gff_file, attribute_name, ids_file, output_file = sys.argv[1:]
  61. filter( gff_file, attribute_name, ids_file, output_file )