PageRenderTime 66ms CodeModel.GetById 38ms RepoModel.GetById 0ms app.codeStats 1ms

/gfam/scripts/find_domain_arch.py

https://github.com/ntamas/gfam
Python | 295 lines | 259 code | 23 blank | 13 comment | 18 complexity | 28c72a539810bfae991f4021ed763f13 MD5 | raw file
Possible License(s): GPL-3.0
  1. #!/usr/bin/env python
  2. """Application that calculates the domain architecture of each gene and
  3. outputs them in a simple text-based format.
  4. """
  5. from collections import defaultdict
  6. import operator
  7. import optparse
  8. import sys
  9. from gfam.assignment import AssignmentOverlapChecker, SequenceWithAssignments
  10. from gfam.interpro import InterPro, InterProNames
  11. from gfam.scripts import CommandLineApp
  12. from gfam.utils import complementerset, redirected
  13. __author__ = "Tamas Nepusz"
  14. __email__ = "tamas@cs.rhul.ac.uk"
  15. __copyright__ = "Copyright (c) 2010, Tamas Nepusz"
  16. __license__ = "GPL"
  17. class FindDomainArchitectureApp(CommandLineApp):
  18. """\
  19. Usage: %prog [options] interpro_file clustering_file
  20. Application that calculates the domain architecture of each gene and
  21. oputs them in a simple text-based format, given the filtered InterPro
  22. assignments and a clustering of the unknown regions in a separate file.
  23. """
  24. short_name = "find_domain_arch"
  25. def __init__(self, *args, **kwds):
  26. super(FindDomainArchitectureApp, self).__init__(*args, **kwds)
  27. self.seqcat = {}
  28. def create_parser(self):
  29. """Creates the command line parser for this application"""
  30. parser = super(FindDomainArchitectureApp, self).create_parser()
  31. parser.add_option("-s", "--min-size", dest="min_size",
  32. metavar="N",
  33. help="consider only clusters with at least N elements as novel domains",
  34. config_key="analysis:find_domain_arch/min_novel_domain_size",
  35. default=2, type=int)
  36. parser.add_option("-S", "--sequences",
  37. dest="sequences_file", metavar="FILE",
  38. help="FASTA file containing all the sequences of the representative gene model",
  39. config_key="file.input.sequences", default=None)
  40. parser.add_option("-i", "--interpro-parent-child-file",
  41. dest="interpro_parent_child_file",
  42. metavar="FILE",
  43. help="use the InterPro parent-child FILE to remap IDs",
  44. config_key="file.mapping.interpro_parent_child",
  45. default=None)
  46. parser.add_option("--details",
  47. dest="details", metavar="FILE",
  48. help="print more details about the domain architecture into FILE",
  49. config_key="generated/file.domain_architecture_details",
  50. default=None)
  51. parser.add_option("--stats",
  52. dest="stats", metavar="FILE",
  53. help="print genome-level statistics about the domain architectures into FILE",
  54. config_key="generated/file.domain_architecture_stats",
  55. default=None)
  56. parser.add_option("-n", "--names",
  57. dest="interpro_names_file",
  58. metavar="FILE",
  59. help="use the given FILE to assign InterPro IDs to names",
  60. config_key="file.mapping.interpro2name",
  61. default=None)
  62. parser.add_option("-r", "--seq-id-regexp", metavar="REGEXP",
  63. help="remap sequence IDs using REGEXP",
  64. config_key="sequence_id_regexp",
  65. dest="sequence_id_regexp")
  66. parser.add_option("--max-overlap", metavar="SIZE",
  67. help="sets the maximum overlap size allowed between "
  68. "assignments of the same data source. Default: %default",
  69. config_key="max_overlap",
  70. dest="max_overlap", type=int, default=20)
  71. return parser
  72. def run_real(self):
  73. """Runs the applications"""
  74. if len(self.args) != 2:
  75. self.error("exactly two input files are expected")
  76. AssignmentOverlapChecker.max_overlap = self.options.max_overlap
  77. if self.options.interpro_parent_child_file:
  78. self.log.info("Loading InterPro parent-child assignments from %s..." % \
  79. self.options.interpro_parent_child_file)
  80. self.interpro = InterPro.FromFile(self.options.interpro_parent_child_file)
  81. else:
  82. self.interpro = InterPro()
  83. self.interpro_names = InterProNames.FromFile(self.options.interpro_names_file)
  84. if self.options.details:
  85. self.details_file = open(self.options.details, "w")
  86. else:
  87. self.details_file = None
  88. interpro_file, clustering_file = self.args
  89. self.process_interpro_file(interpro_file)
  90. self.process_clustering_file(clustering_file)
  91. self.sort_by_domain_architecture()
  92. for seqs in self.domain_archs.itervalues():
  93. seqs.sort()
  94. self.domain_archs = self.domain_archs.items()
  95. self.domain_archs.sort(key=lambda x: len(x[1]), reverse=True)
  96. for domain_arch, members in self.domain_archs:
  97. if domain_arch:
  98. arch_str = ";".join(domain_arch)
  99. else:
  100. arch_str = "NO_ASSIGNMENT"
  101. arch_str_pos = "NO_ASSIGNMENT"
  102. arch_desc = "NO_DESCRIPTION"
  103. family_length = len(members)
  104. for member in members:
  105. seq = self.seqcat[member]
  106. if domain_arch:
  107. arch_str_pos = ";".join(assignment.short_repr() \
  108. for assignment in seq.assignments)
  109. arch_desc = ";".join( \
  110. self.interpro_names[assignment.domain]
  111. for assignment in seq.assignments
  112. )
  113. print "%s\t%d\t%s\t%d\t%s\t%s" % (member, seq.length, arch_str, \
  114. family_length, arch_str_pos, \
  115. arch_desc)
  116. self.details_file.close()
  117. if self.options.stats:
  118. stats_file = open(self.options.stats, "w")
  119. total_residues, covered_residues, covered_residues_nonnovel = 0.0, 0, 0
  120. nonnovel_sources = complementerset(["Novel"])
  121. for seq in self.seqcat.itervalues():
  122. total_residues += seq.length
  123. covered_residues += round(seq.coverage() * seq.length)
  124. covered_residues_nonnovel += round(seq.coverage(sources=nonnovel_sources) * seq.length)
  125. all_archs = set(arch for arch, _ in self.domain_archs)
  126. num_archs = len(all_archs)
  127. if "" in self.domain_archs:
  128. num_archs -= 1
  129. def exclude_novel_domains(domain_architecture):
  130. """Excludes novel domains from a domain architecture and returns
  131. the filtered domain architecture as a tuple."""
  132. return tuple(a for a in domain_architecture if not a.startswith("NOVEL"))
  133. archs_without_novel = set(exclude_novel_domains(arch)
  134. for arch in all_archs)
  135. archs_without_novel.discard(())
  136. num_archs_without_novel = len(archs_without_novel)
  137. num_seqs_with_nonempty_domain_arch = \
  138. sum(len(value) for key, value in self.domain_archs if key)
  139. num_seqs_with_nonempty_domain_arch_ignore_novel = \
  140. sum(len(value) for key, value in self.domain_archs
  141. if exclude_novel_domains(key) in archs_without_novel)
  142. num_seqs_with_nonempty_nonnovel_domain_arch = \
  143. sum(len(value) for key, value in self.domain_archs
  144. if key and not any(a.startswith("NOVEL") for a in key))
  145. with redirected(stdout=stats_file):
  146. print "Domain architectures"
  147. print "===================="
  148. print ""
  149. print "Non-empty: %d" % num_archs
  150. print "Non-empty (when ignoring novel domains): %d" % num_archs_without_novel
  151. print ""
  152. print "Sequences"
  153. print "========="
  154. print ""
  155. print "Total: %d" % len(self.seqcat)
  156. print "With at least one domain: %d (%.4f%%)" %\
  157. (num_seqs_with_nonempty_domain_arch,
  158. 100.0 * num_seqs_with_nonempty_domain_arch / len(self.seqcat))
  159. print "With at least one non-novel domain: %d (%.4f%%)" %\
  160. (num_seqs_with_nonempty_domain_arch_ignore_novel,
  161. 100.0 * num_seqs_with_nonempty_domain_arch_ignore_novel / len(self.seqcat))
  162. print "With at least one domain and no novel domains: %d (%.4f%%)" %\
  163. (num_seqs_with_nonempty_nonnovel_domain_arch,
  164. 100.0 * num_seqs_with_nonempty_nonnovel_domain_arch / len(self.seqcat))
  165. print ""
  166. print "Residues"
  167. print "========"
  168. print ""
  169. print "Total: %d" % total_residues
  170. print "Covered: %d (%.4f%%)" % (covered_residues, 100.0*covered_residues/total_residues)
  171. print "Covered by non-novel: %d (%.4f%%)" % (covered_residues_nonnovel, 100.0*covered_residues_nonnovel/total_residues)
  172. stats_file.close()
  173. def process_interpro_file(self, interpro_file):
  174. from gfam.scripts.find_unassigned import FindUnassignedApp
  175. unassigned_app = FindUnassignedApp()
  176. unassigned_app.set_sequence_id_regexp(self.options.sequence_id_regexp)
  177. unassigned_app.process_sequences_file(self.options.sequences_file)
  178. unassigned_app.process_infile(interpro_file)
  179. self.seqcat = unassigned_app.seqcat
  180. for seq_id in set(unassigned_app.seq_ids_to_length.keys()) - set(self.seqcat.keys()):
  181. self.seqcat[seq_id] = SequenceWithAssignments(seq_id, \
  182. unassigned_app.seq_ids_to_length[seq_id])
  183. def process_clustering_file(self, fname):
  184. f = open(fname)
  185. idx = 1
  186. for line in f:
  187. ids = line.strip().split()
  188. if len(ids) < self.options.min_size:
  189. continue
  190. domain_name = "NOVEL%05d" % idx
  191. idx += 1
  192. for id in ids:
  193. seq_id, _, limits = id.rpartition(":")
  194. start, end = map(int, limits.split("-"))
  195. self.seqcat[seq_id].assign_(start, end, domain_name)
  196. f.close()
  197. def sort_by_domain_architecture(self):
  198. self.domain_archs = defaultdict(list)
  199. for seq_id, seq in self.seqcat.iteritems():
  200. assignments = sorted(seq.assignments, key=operator.attrgetter("start"))
  201. domains = []
  202. if self.details_file:
  203. print >>self.details_file, seq_id
  204. primary_source = set()
  205. new_assignments = []
  206. for assignment in assignments:
  207. new_assignment = assignment.resolve_interpro_ids(self.interpro)
  208. if assignment.comment == "1":
  209. primary_source.add(assignment.source)
  210. domains.append(new_assignment.domain)
  211. new_assignments.append(new_assignment)
  212. self.domain_archs[tuple(domains)].append(seq_id)
  213. if not primary_source:
  214. primary_source = None
  215. else:
  216. primary_source = ", ".join(primary_source)
  217. if self.details_file:
  218. seq2 = SequenceWithAssignments(seq.name, seq.length)
  219. seq2.assignments = [assignment for assignment in assignments \
  220. if assignment.source != "Novel"]
  221. sources = sorted(set(assignment.source \
  222. for assignment in assignments \
  223. if assignment.source != "Novel"))
  224. print >>self.details_file, " Primary assignment source:", primary_source
  225. print >>self.details_file, " Number of data sources used:", len(sources)
  226. print >>self.details_file, " Data sources: %s" % ", ".join(sources)
  227. print >>self.details_file, " Coverage: %.3f" % seq.coverage()
  228. print >>self.details_file, " Coverage w/o novel domains: %.3f" % seq2.coverage()
  229. for assignment in assignments:
  230. attrs = assignment._asdict()
  231. if assignment.comment is None and \
  232. assignment.domain.startswith("NOVEL"):
  233. attrs["comment"] = "novel"
  234. row = " %(start)4d-%(end)4d: %(domain)s "\
  235. "(%(source)s, stage: %(comment)s)" % attrs
  236. print >>self.details_file, row,
  237. interpro_id = assignment.interpro_id
  238. if not interpro_id and assignment.domain in self.interpro.mapping:
  239. interpro_id = self.interpro.mapping[assignment.domain]
  240. if interpro_id:
  241. anc = self.interpro.tree.get_most_remote_ancestor(interpro_id)
  242. if interpro_id == anc:
  243. print >>self.details_file, "(InterPro ID: %s)" % anc
  244. else:
  245. print >>self.details_file, "(InterPro ID: %s --> %s)" % (interpro_id, anc)
  246. if anc in self.interpro_names:
  247. print >>self.details_file, " "*(row.index(":")+1), self.interpro_names[anc]
  248. else:
  249. print >>self.details_file, ""
  250. if assignment.domain in self.interpro_names:
  251. print >>self.details_file, " "*(row.index(":")+1), self.interpro_names[assignment.domain]
  252. print >>self.details_file, ""
  253. seq.assignments = new_assignments
  254. if __name__ == "__main__":
  255. sys.exit(FindDomainArchitectureApp().run())