/tools/taxonomy/find_diag_hits.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 170 lines · 149 code · 4 blank · 17 comment · 6 complexity · 90e16dbaa2f441bf8e6547899aff00a6 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
  4. finds reads that only hit one taxonomic group. For example, consider the folliowing example:
  5. read1 mammalia
  6. read1 insecta
  7. read2 insecta
  8. in this case only read2 will be selected becuase it stays within insecta
  9. This program takes the following options:
  10. file in taxonomy format - dataset that complies with Galaxy's taxonomy format
  11. id column - integer specifying the number of column containing seq id (starting with 1)
  12. taxonomic ranks - a comma separated list of ranks from this list:
  13. superkingdom
  14. kingdom
  15. subkingdom
  16. superphylum
  17. phylum
  18. subphylum
  19. superclass
  20. class
  21. subclass
  22. superorder
  23. order
  24. suborder
  25. superfamily
  26. family
  27. subfamily
  28. tribe
  29. subtribe
  30. genus
  31. subgenus
  32. species
  33. subspecies
  34. output format - reads or counts
  35. """
  36. from galaxy import eggs
  37. import pkg_resources
  38. pkg_resources.require( 'pysqlite' )
  39. from pysqlite2 import dbapi2 as sqlite
  40. import string, sys, tempfile
  41. # This dictionary maps taxonomic ranks to fields of Taxonomy file
  42. taxRank = {
  43. 'root' :2,
  44. 'superkingdom':3,
  45. 'kingdom' :4,
  46. 'subkingdom' :5,
  47. 'superphylum' :6,
  48. 'phylum' :7,
  49. 'subphylum' :8,
  50. 'superclass' :9,
  51. 'class' :10,
  52. 'subclass' :11,
  53. 'superorder' :12,
  54. 'ord' :13,
  55. 'suborder' :14,
  56. 'superfamily' :15,
  57. 'family' :16,
  58. 'subfamily' :17,
  59. 'tribe' :18,
  60. 'subtribe' :19,
  61. 'genus' :20,
  62. 'subgenus' :21,
  63. 'species' :22,
  64. 'subspecies' :23,
  65. 'order' :13
  66. }
  67. def stop_err(msg):
  68. sys.stderr.write(msg)
  69. sys.exit()
  70. db = tempfile.NamedTemporaryFile('w')
  71. try:
  72. con = sqlite.connect(db.name)
  73. cur = con.cursor()
  74. except:
  75. stop_err('Cannot connect to %s\n') % db.name
  76. try:
  77. tax_file = open(sys.argv[1], 'r')
  78. id_col = int(sys.argv[2]) - 1
  79. taxa = string.split(sys.argv[3].rstrip(),',')
  80. if sys.argv[4] == 'reads':
  81. out_format = True
  82. elif sys.argv[4] == 'counts':
  83. out_format = False
  84. else:
  85. stop_err('Please specify "reads" or "counts" for output format\n')
  86. out_file = open(sys.argv[5], 'w')
  87. except:
  88. stop_err('Check arguments\n')
  89. if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
  90. sql = ""
  91. for i in range(len(taxa)):
  92. if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
  93. sql += '%s text, ' % taxa[i]
  94. sql = sql.strip(', ')
  95. sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
  96. cur.execute(sql)
  97. invalid_line_number = 0
  98. try:
  99. for line in tax_file:
  100. fields = string.split(line.rstrip(), '\t')
  101. if len(fields) < 24:
  102. invalid_line_number += 1
  103. continue # Skipping malformed taxonomy lines
  104. val_string = '"' + fields[id_col] + '", '
  105. for rank in taxa:
  106. taxon = fields[taxRank[rank]]
  107. val_string += '"%s", ' % taxon
  108. val_string = val_string.strip(', ')
  109. val_string = "insert into tax values(" + val_string + ")"
  110. cur.execute(val_string)
  111. except Exception, e:
  112. stop_err('%s\n' % e)
  113. tax_file.close()
  114. try:
  115. for rank in taxa:
  116. cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank )
  117. cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
  118. cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
  119. cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
  120. if rank == 'ord':
  121. rankName = 'order'
  122. else:
  123. rankName = rank
  124. if out_format:
  125. cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
  126. for item in cur.fetchall():
  127. out_string = '%s\t%s\t' % ( item[0], item[1] )
  128. out_string += rankName
  129. print >>out_file, out_string
  130. else:
  131. cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
  132. for item in cur.fetchall():
  133. out_string = '%s\t%s\t' % ( item[0], item[1] )
  134. out_string += rankName
  135. print >>out_file, out_string
  136. except Exception, e:
  137. stop_err("%s\n" % e)