/SourcePackages/forester-atv/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java

https://github.com/stamatak/EPA-WorkBench · Java · 365 lines · 337 code · 26 blank · 2 comment · 71 complexity · 58dcaff3bcf2dceb70cfd0ced157e65c MD5 · raw file

  1. package org.forester.surfacing;
  2. import java.text.DecimalFormat;
  3. import java.text.NumberFormat;
  4. import java.util.ArrayList;
  5. import java.util.Collections;
  6. import java.util.Comparator;
  7. import java.util.HashMap;
  8. import java.util.HashSet;
  9. import java.util.List;
  10. import java.util.Map;
  11. import java.util.Set;
  12. import java.util.SortedMap;
  13. import java.util.SortedSet;
  14. import java.util.TreeMap;
  15. import java.util.TreeSet;
  16. import org.forester.go.GoId;
  17. import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
  18. import org.forester.util.BasicDescriptiveStatistics;
  19. import org.forester.util.DescriptiveStatistics;
  20. import org.forester.util.ForesterUtil;
  21. public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDomains {
  22. private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" );
  23. private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_COUNT_ORDER = new Comparator<CombinableDomains>() {
  24. public int compare( final CombinableDomains d1,
  25. final CombinableDomains d2 ) {
  26. if ( d1
  27. .getKeyDomainCount() < d2
  28. .getKeyDomainCount() ) {
  29. return 1;
  30. }
  31. else if ( d1
  32. .getKeyDomainCount() > d2
  33. .getKeyDomainCount() ) {
  34. return -1;
  35. }
  36. else {
  37. return d1
  38. .getKeyDomain()
  39. .getId()
  40. .compareTo( d2
  41. .getKeyDomain()
  42. .getId() );
  43. }
  44. }
  45. };
  46. private static final Comparator<CombinableDomains> DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER = new Comparator<CombinableDomains>() {
  47. public int compare( final CombinableDomains d1,
  48. final CombinableDomains d2 ) {
  49. if ( d1
  50. .getKeyDomainProteinsCount() < d2
  51. .getKeyDomainProteinsCount() ) {
  52. return 1;
  53. }
  54. else if ( d1
  55. .getKeyDomainProteinsCount() > d2
  56. .getKeyDomainProteinsCount() ) {
  57. return -1;
  58. }
  59. else {
  60. return d1
  61. .getKeyDomain()
  62. .getId()
  63. .compareTo( d2
  64. .getKeyDomain()
  65. .getId() );
  66. }
  67. }
  68. };
  69. private static final Comparator<CombinableDomains> DESCENDING_COMBINATIONS_COUNT_ORDER = new Comparator<CombinableDomains>() {
  70. public int compare( final CombinableDomains d1,
  71. final CombinableDomains d2 ) {
  72. if ( d1
  73. .getNumberOfCombinableDomains() < d2
  74. .getNumberOfCombinableDomains() ) {
  75. return 1;
  76. }
  77. else if ( d1
  78. .getNumberOfCombinableDomains() > d2
  79. .getNumberOfCombinableDomains() ) {
  80. return -1;
  81. }
  82. else {
  83. return d1
  84. .getKeyDomain()
  85. .getId()
  86. .compareTo( d2
  87. .getKeyDomain()
  88. .getId() );
  89. }
  90. }
  91. };
  92. final private SortedMap<DomainId, CombinableDomains> _combinable_domains_map;
  93. final private Species _species;
  94. final private DomainCombinationType _dc_type;
  95. private BasicGenomeWideCombinableDomains( final Species species, final DomainCombinationType dc_type ) {
  96. _combinable_domains_map = new TreeMap<DomainId, CombinableDomains>();
  97. _species = species;
  98. _dc_type = dc_type;
  99. }
  100. private void add( final DomainId key, final CombinableDomains cdc ) {
  101. _combinable_domains_map.put( key, cdc );
  102. }
  103. public boolean contains( final DomainId key_id ) {
  104. return _combinable_domains_map.containsKey( key_id );
  105. }
  106. public CombinableDomains get( final DomainId key_id ) {
  107. return _combinable_domains_map.get( key_id );
  108. }
  109. public SortedMap<DomainId, CombinableDomains> getAllCombinableDomainsIds() {
  110. return _combinable_domains_map;
  111. }
  112. @Override
  113. public SortedSet<DomainId> getAllDomainIds() {
  114. final SortedSet<DomainId> domains = new TreeSet<DomainId>();
  115. for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
  116. final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
  117. final List<DomainId> ds = cb.getAllDomains();
  118. for( final DomainId d : ds ) {
  119. domains.add( d );
  120. }
  121. }
  122. return domains;
  123. }
  124. @Override
  125. public DomainCombinationType getDomainCombinationType() {
  126. return _dc_type;
  127. }
  128. @Override
  129. public SortedSet<DomainId> getMostPromiscuosDomain() {
  130. final SortedSet<DomainId> doms = new TreeSet<DomainId>();
  131. final int max = ( int ) getPerGenomeDomainPromiscuityStatistics().getMax();
  132. for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
  133. final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
  134. if ( cb.getNumberOfCombinableDomains() == max ) {
  135. doms.add( key );
  136. }
  137. }
  138. return doms;
  139. }
  140. @Override
  141. public DescriptiveStatistics getPerGenomeDomainPromiscuityStatistics() {
  142. final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
  143. for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
  144. final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
  145. stats.addValue( cb.getNumberOfCombinableDomains() );
  146. }
  147. return stats;
  148. }
  149. public int getSize() {
  150. return _combinable_domains_map.size();
  151. }
  152. public Species getSpecies() {
  153. return _species;
  154. }
  155. @Override
  156. public SortedSet<BinaryDomainCombination> toBinaryDomainCombinations() {
  157. final SortedSet<BinaryDomainCombination> binary_combinations = new TreeSet<BinaryDomainCombination>();
  158. for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
  159. final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
  160. for( final BinaryDomainCombination b : cb.toBinaryDomainCombinations() ) {
  161. binary_combinations.add( b );
  162. }
  163. }
  164. return binary_combinations;
  165. }
  166. @Override
  167. public String toString() {
  168. return toStringBuilder( GenomeWideCombinableDomainsSortOrder.ALPHABETICAL_KEY_ID ).toString();
  169. }
  170. // Produces something like:
  171. // 2-oxoacid_dh 5 5 2 4.8E-67 Biotin_lipoyl [4], E3_binding [3]
  172. public StringBuilder toStringBuilder( final GenomeWideCombinableDomainsSortOrder sort_order ) {
  173. final StringBuilder sb = new StringBuilder();
  174. final List<CombinableDomains> combinable_domains = new ArrayList<CombinableDomains>();
  175. for( final DomainId key : getAllCombinableDomainsIds().keySet() ) {
  176. final CombinableDomains cb = getAllCombinableDomainsIds().get( key );
  177. combinable_domains.add( cb );
  178. }
  179. if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_COUNT ) {
  180. Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_COUNT_ORDER );
  181. }
  182. else if ( sort_order == GenomeWideCombinableDomainsSortOrder.KEY_DOMAIN_PROTEINS_COUNT ) {
  183. Collections.sort( combinable_domains,
  184. BasicGenomeWideCombinableDomains.DESCENDING_KEY_DOMAIN_PROTEINS_COUNT_ORDER );
  185. }
  186. else if ( sort_order == GenomeWideCombinableDomainsSortOrder.COMBINATIONS_COUNT ) {
  187. Collections.sort( combinable_domains, BasicGenomeWideCombinableDomains.DESCENDING_COMBINATIONS_COUNT_ORDER );
  188. }
  189. for( final CombinableDomains cb : combinable_domains ) {
  190. sb.append( ForesterUtil.pad( new StringBuffer( cb.getKeyDomain().toString() ), 18, ' ', false ) );
  191. sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainCount() ), 8, ' ', false ) );
  192. sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getKeyDomainProteinsCount() ), 8, ' ', false ) );
  193. sb.append( ForesterUtil.pad( new StringBuffer( "" + cb.getNumberOfCombinableDomains() ), 8, ' ', false ) );
  194. sb
  195. .append( ForesterUtil
  196. .pad( new StringBuffer( ""
  197. + FORMATTER
  198. .format( cb.getKeyDomainConfidenceDescriptiveStatistics().median() ) ),
  199. 10,
  200. ' ',
  201. false ) );
  202. sb.append( cb.getCombiningDomainIdsAsStringBuilder() );
  203. sb.append( ForesterUtil.getLineSeparator() );
  204. }
  205. return sb;
  206. }
  207. private static void countDomains( final Map<DomainId, Integer> domain_counts,
  208. final Map<DomainId, Integer> domain_protein_counts,
  209. final Map<DomainId, DescriptiveStatistics> stats,
  210. final Set<DomainId> saw_c,
  211. final DomainId id_i,
  212. final double support ) {
  213. if ( domain_counts.containsKey( id_i ) ) {
  214. domain_counts.put( id_i, 1 + domain_counts.get( ( id_i ) ) );
  215. if ( !saw_c.contains( id_i ) ) {
  216. domain_protein_counts.put( id_i, 1 + domain_protein_counts.get( ( id_i ) ) );
  217. }
  218. }
  219. else {
  220. stats.put( id_i, new BasicDescriptiveStatistics() );
  221. domain_counts.put( id_i, 1 );
  222. domain_protein_counts.put( id_i, 1 );
  223. }
  224. stats.get( id_i ).addValue( support );
  225. saw_c.add( id_i );
  226. }
  227. public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
  228. final boolean ignore_combination_with_same_domain,
  229. final Species species ) {
  230. return createInstance( protein_list,
  231. ignore_combination_with_same_domain,
  232. species,
  233. null,
  234. DomainCombinationType.BASIC );
  235. }
  236. public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
  237. final boolean ignore_combination_with_same_domain,
  238. final Species species,
  239. final DomainCombinationType dc_type ) {
  240. return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type );
  241. }
  242. public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
  243. final boolean ignore_combination_with_same_domain,
  244. final Species species,
  245. final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
  246. final DomainCombinationType dc_type ) {
  247. final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
  248. final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
  249. final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
  250. final Map<DomainId, DescriptiveStatistics> stats = new HashMap<DomainId, DescriptiveStatistics>();
  251. for( final Protein protein : protein_list ) {
  252. if ( !protein.getSpecies().equals( species ) ) {
  253. throw new IllegalArgumentException( "species (" + protein.getSpecies()
  254. + ") does not match species of combinable domains collection (" + species + ")" );
  255. }
  256. final Set<DomainId> saw_i = new HashSet<DomainId>();
  257. final Set<DomainId> saw_c = new HashSet<DomainId>();
  258. for( int i = 0; i < protein.getProteinDomains().size(); ++i ) {
  259. final Domain pd_i = protein.getProteinDomain( i );
  260. final DomainId id_i = pd_i.getDomainId();
  261. final int current_start = pd_i.getFrom();
  262. BasicGenomeWideCombinableDomains.countDomains( domain_counts,
  263. domain_protein_counts,
  264. stats,
  265. saw_c,
  266. id_i,
  267. pd_i.getPerSequenceEvalue() );
  268. if ( !saw_i.contains( id_i ) ) {
  269. if ( dc_type == DomainCombinationType.BASIC ) {
  270. saw_i.add( id_i );
  271. }
  272. CombinableDomains domain_combination = null;
  273. if ( instance.contains( id_i ) ) {
  274. domain_combination = instance.get( id_i );
  275. }
  276. else {
  277. if ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) {
  278. domain_combination = new AdjactantDirectedCombinableDomains( pd_i.getDomainId(), species );
  279. }
  280. else if ( dc_type == DomainCombinationType.DIRECTED ) {
  281. domain_combination = new DirectedCombinableDomains( pd_i.getDomainId(), species );
  282. }
  283. else {
  284. domain_combination = new BasicCombinableDomains( pd_i.getDomainId(), species );
  285. }
  286. if ( ( domain_id_to_go_ids_map != null )
  287. && domain_id_to_go_ids_map.containsKey( pd_i.getDomainId() ) ) {
  288. final List<GoId> go_ids = domain_id_to_go_ids_map.get( pd_i.getDomainId() );
  289. for( final GoId go_id : go_ids ) {
  290. domain_combination.getKeyDomain().addGoId( go_id );
  291. }
  292. }
  293. instance.add( id_i, domain_combination );
  294. }
  295. final Set<DomainId> saw_j = new HashSet<DomainId>();
  296. if ( ignore_combination_with_same_domain ) {
  297. saw_j.add( id_i );
  298. }
  299. Domain closest = null;
  300. for( int j = 0; j < protein.getNumberOfProteinDomains(); ++j ) {
  301. if ( ( dc_type != DomainCombinationType.BASIC )
  302. && ( current_start >= protein.getProteinDomain( j ).getFrom() ) ) {
  303. continue;
  304. }
  305. if ( i != j ) {
  306. final DomainId id = protein.getProteinDomain( j ).getDomainId();
  307. if ( !saw_j.contains( id ) ) {
  308. saw_j.add( id );
  309. if ( dc_type != DomainCombinationType.DIRECTED_ADJACTANT ) {
  310. domain_combination
  311. .addCombinableDomain( protein.getProteinDomain( j ).getDomainId() );
  312. }
  313. else {
  314. if ( closest == null ) {
  315. closest = protein.getProteinDomain( j );
  316. }
  317. else {
  318. if ( protein.getProteinDomain( j ).getFrom() < closest.getFrom() ) {
  319. closest = protein.getProteinDomain( j );
  320. }
  321. }
  322. }
  323. }
  324. }
  325. }
  326. if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
  327. domain_combination.addCombinableDomain( closest.getDomainId() );
  328. }
  329. }
  330. }
  331. }
  332. for( final DomainId key_id : domain_counts.keySet() ) {
  333. instance.get( key_id ).setKeyDomainCount( domain_counts.get( key_id ) );
  334. instance.get( key_id ).setKeyDomainProteinsCount( domain_protein_counts.get( key_id ) );
  335. instance.get( key_id ).setKeyDomainConfidenceDescriptiveStatistics( stats.get( key_id ) );
  336. }
  337. return instance;
  338. }
  339. }