/Source/Framework/Add-ins/Bio.Pamsam/Algorithms/Alignment/MultipleSequenceAlignment/HierarchicalClustering.cs

# · C# · 205 lines · 89 code · 21 blank · 95 comment · 2 complexity · 6f07a3a44d65a43a44930d225d4ce195 MD5 · raw file

  1. // *********************************************************
  2. //
  3. // Copyright (c) Microsoft. All rights reserved.
  4. // This code is licensed under the Apache License, Version 2.0.
  5. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  6. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  7. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  8. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  9. //
  10. // *********************************************************
  11. using System.Collections.Generic;
  12. using System;
  13. namespace Bio.Algorithms.Alignment.MultipleSequenceAlignment
  14. {
  15. /// <summary>
  16. /// Hierarchically clusters sequences based on distance matrix.
  17. ///
  18. /// Steps:
  19. /// 1) Initially all sequences are clusters themselves.
  20. /// 2) Iteratively: identify the cloesest pair of sequences (clusters) in the distance matrix,
  21. /// cluster them into one cluster, and update distances of this cluster with the rest clusters.
  22. /// 3) Terminate when only one cluster left.
  23. ///
  24. /// A binary guide tree is then generated:
  25. /// the root of the tree is the final cluster; leaves are sequence clusters.
  26. /// From bottom up, the nodes order represents the clustering order,
  27. /// and it's kept in a node list.
  28. /// The progressive aligner will then follow this order to align the set of sequences.
  29. /// </summary>
  30. public abstract class HierarchicalClustering : IHierarchicalClustering
  31. {
  32. #region Fields
  33. /// <summary>
  34. /// The node list in the generated binary tree
  35. /// </summary>
  36. protected List<BinaryGuideTreeNode> _nodes = null;
  37. /// <summary>
  38. /// The edge list
  39. /// </summary>
  40. protected List<BinaryGuideTreeEdge> _edges = null;
  41. /// <summary>
  42. /// Delegate function for updating distances
  43. /// </summary>
  44. protected UpdateDistanceMethodSelector _updateDistanceMethod;
  45. /// <summary>
  46. /// The list stores the current clusters generated
  47. /// </summary>
  48. protected List<int> _clusters = null;
  49. /// <summary>
  50. /// Record the index of cluster using the index in the distance matrix
  51. /// </summary>
  52. protected int[] _indexToCluster = null;
  53. #endregion
  54. #region Properties
  55. /// <summary>
  56. /// The node list of this class
  57. /// </summary>
  58. virtual public List<BinaryGuideTreeNode> Nodes
  59. {
  60. get { return _nodes; }
  61. }
  62. /// <summary>
  63. /// The edge list of this class
  64. /// </summary>
  65. virtual public List<BinaryGuideTreeEdge> Edges
  66. {
  67. get { return _edges; }
  68. }
  69. #endregion
  70. #region Constructors
  71. /// <summary>
  72. /// Construct clusters based on distance matrix.
  73. /// The default distance update method is 'average'
  74. /// </summary>
  75. /// <param name="distanceMatrix">IDistanceMatrix</param>
  76. public HierarchicalClustering(IDistanceMatrix distanceMatrix)
  77. : this(distanceMatrix, UpdateDistanceMethodsTypes.Single)
  78. {
  79. }
  80. /// <summary>
  81. /// Construct clusters using different update methods
  82. /// </summary>
  83. /// <param name="distanceMatrix">IDistanceMatrix</param>
  84. /// <param name="updateDistanceMethodName">enum EUpdateDistanceMethods</param>
  85. public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
  86. {
  87. if (distanceMatrix.Dimension <= 0)
  88. {
  89. throw new Exception("Invalid distance matrix dimension");
  90. }
  91. try
  92. {
  93. // The number of nodes in the final tree is 2N-2:
  94. // N sequence nodes (leaves) and N-2 internal nodes
  95. // where N is the number of input sequences
  96. _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
  97. _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);
  98. // The number of clusters is the number of leaves at the beginning
  99. // As the algorithm merges clusters, only one cluster remains.
  100. _clusters = new List<int>(distanceMatrix.Dimension);
  101. // Construct _indexToCluster
  102. _indexToCluster = new int[distanceMatrix.Dimension];
  103. for (int i = 0; i < distanceMatrix.Dimension; ++i)
  104. {
  105. _indexToCluster[i] = i;
  106. }
  107. }
  108. catch (OutOfMemoryException ex)
  109. {
  110. throw new Exception("Out of memory", ex.InnerException);
  111. }
  112. // Choose a update-distance method
  113. switch(updateDistanceMethodName)
  114. {
  115. case(UpdateDistanceMethodsTypes.Average):
  116. _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
  117. break;
  118. case(UpdateDistanceMethodsTypes.Single):
  119. _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
  120. break;
  121. case(UpdateDistanceMethodsTypes.Complete):
  122. _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
  123. break;
  124. case(UpdateDistanceMethodsTypes.WeightedMAFFT):
  125. _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
  126. break;
  127. default:
  128. throw new Exception("invalid update method");
  129. }
  130. }
  131. #endregion
  132. #region Update cluster methods
  133. // Check out enum UpdateDistanceMethodsTypes for details
  134. /// <summary>
  135. /// arithmetic average of distance[nextA,other] and distance[nextB,other]
  136. /// </summary>
  137. /// <param name="distanceMatrix">distance matrix for the cluster</param>
  138. /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
  139. /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
  140. /// <param name="other">the other cluster whose distance will be updated</param>
  141. protected float UpdateAverage(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
  142. {
  143. return (distanceMatrix[nextA, other] + distanceMatrix[nextB, other]) / 2;
  144. }
  145. /// <summary>
  146. /// Minimum of distance[nextA,other] and distance[nextB,other]
  147. /// </summary>
  148. /// <param name="distanceMatrix">distance matrix for the cluster</param>
  149. /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
  150. /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
  151. /// <param name="other">the other cluster whose distance will be updated</param>
  152. protected float UpdateSingle(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
  153. {
  154. return Math.Min(distanceMatrix[nextA, other], distanceMatrix[nextB, other]);
  155. }
  156. /// <summary>
  157. /// Maximum of distance[nextA,other] and distance[nextB,other]
  158. /// </summary>
  159. /// <param name="distanceMatrix">distance matrix for the cluster</param>
  160. /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
  161. /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
  162. /// <param name="other">the other cluster whose distance will be updated</param>
  163. protected float UpdateComplete(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
  164. {
  165. return Math.Max(distanceMatrix[nextA, other], distanceMatrix[nextB, other]);
  166. }
  167. /// <summary>
  168. /// Adapted from MAFFT software:
  169. /// weighted mixture of minimum and average linkage
  170. /// d = (1-s)*d_min + s*d_avg
  171. /// where s is 0.1 by default
  172. /// </summary>
  173. /// <param name="distanceMatrix">distance matrix for the cluster</param>
  174. /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
  175. /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
  176. /// <param name="other">the other cluster whose distance will be updated</param>
  177. protected float UpdateWeightedMAFFT(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
  178. {
  179. return (float)(0.9*Math.Min(distanceMatrix[nextA, other], distanceMatrix[nextB, other])
  180. + 0.1*(distanceMatrix[nextA, other] + distanceMatrix[nextB, other]) / 2);
  181. }
  182. #endregion
  183. }
  184. }