PageRenderTime 26ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/class/k-Means_clustering.php

https://github.com/srahn/kvwmap
PHP | 233 lines | 142 code | 22 blank | 69 comment | 21 complexity | b62d92145cdb807ce63622712c331d0c MD5 | raw file
  1. <?php
  2. ###################################################################
  3. # kvwmap - Kartenserver für Kreisverwaltungen #
  4. ###################################################################
  5. # Lizenz #
  6. # #
  7. # Copyright (C) 2004 Peter Korduan #
  8. # #
  9. # This program is free software; you can redistribute it and/or #
  10. # modify it under the terms of the GNU General Public License as #
  11. # published by the Free Software Foundation; either version 2 of #
  12. # the License, or (at your option) any later version. #
  13. # #
  14. # This program is distributed in the hope that it will be useful, #
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of #
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
  17. # GNU General Public License for more details. #
  18. # #
  19. # You should have received a copy of the GNU General Public #
  20. # License along with this program; if not, write to the Free #
  21. # Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, #
  22. # MA 02111-1307, USA. #
  23. # #
  24. # Kontakt: #
  25. # peter.korduan@gdi-service.de #
  26. # stefan.rahn@gdi-service.de #
  27. ###################################################################
  28. #############################
  29. # Klasse kMeansClustering #
  30. #############################
  31. class kMeansClustering {
  32. public static function seedsFromLocalMaxima($histogram, $data) {
  33. // number of smoothing operations depends on
  34. // histogram resolution and desirable approx. number
  35. // of clusters, for a gaussian kernel of size 5 it
  36. // can be calculated as:
  37. //
  38. // |H| - |C|
  39. // n = -----------
  40. // 4 * |C|
  41. //
  42. // where |H| is the number of bins in the histogram,
  43. // and |C| is the desirable approx. number of clusters
  44. $glatt = kMeansClustering::glätten($histogram);
  45. $glatt = kMeansClustering::glätten($glatt);
  46. $glatt = kMeansClustering::glätten($glatt);
  47. $glatt = kMeansClustering::glätten($glatt);
  48. #echo '<br>$glatt: ';var_dump($glatt); echo '<br>';
  49. $maxima = kMeansClustering::findLocalMaxima($glatt);
  50. #echo '<br>$maxima: ';var_dump($maxima); echo '<br>';
  51. $dataMax = array_reduce($data, function($p,$c) {return max($p,$c);},-INF);
  52. $dataMin = array_reduce($data, function($p,$c) {return min($p,$c);}, INF);
  53. $scale = ($dataMax - $dataMin)/100.0;
  54. $offset = $dataMin;
  55. $seeds = array_map(function($value) use ($scale, $offset){
  56. return $value * $scale + $offset;
  57. }, $maxima);
  58. return $seeds;
  59. }
  60. public static function kMeansWithSeeds($data, $means){
  61. $classMap=array();
  62. $classMappingChanged = true;
  63. while ($classMappingChanged) {
  64. $classMappingChanged = false;
  65. #echo '<br>MEANS: ';var_dump($means); echo '<br>';
  66. // assignment step
  67. $meansCount = count($means);
  68. array_walk($data, function($value,$dIdx) use (&$classMap, &$classMappingChanged, $means, $meansCount) {
  69. $dist = INF;
  70. $cls = 0;
  71. for ($mIdx = 0; $mIdx < $meansCount; $mIdx++) {
  72. $mDist = pow($value - $means[$mIdx],2);
  73. if ($mDist < $dist) {
  74. $dist = $mDist;
  75. $cls = $mIdx;
  76. }
  77. }
  78. if ($classMap[$dIdx] != "$cls") {
  79. $classMap[$dIdx] = "$cls";
  80. $classMappingChanged = true;
  81. }
  82. });
  83. // update step
  84. $accumulator = array_map(function($item){
  85. return array('sum'=>0,'count'=>0,'mean'=>$item);
  86. }, $means);
  87. array_walk($data, function($value, $dIdx) use (&$accumulator, $classMap) {
  88. $cls = $classMap[$dIdx];
  89. $accumulator[$cls]['sum'] += $value;
  90. $accumulator[$cls]['count']++;
  91. });
  92. // calculate new cluster centers
  93. array_walk($accumulator,function(&$item){
  94. if ($item['count'] != 0) $item['mean'] = $item['sum']/$item['count'];
  95. });
  96. $means = array_map(function($item) {
  97. return $item['mean'];
  98. }, $accumulator);
  99. #echo '<br>COUNTS: ';var_dump(array_map(function($item) {return $item['count'];}, $accumulator)); echo '<br>';
  100. } // while
  101. return $classMap;
  102. }
  103. // divide data into $numCls clusters by 'divide and conquer'-approach
  104. // starting with a single cluster, every iteration the cluster with highest
  105. // residual energy is iteratively split into two, until the number of
  106. // clusters equals $numCls
  107. public static function kMeansNoSeeds($data, $numCls){
  108. // check numCls
  109. $numCls = max(2, min($numCls, kMeansClustering::getNumberOfUniqueValues($data)));
  110. // calculate offset for cluster-splitting (e.g. 1/256th of the average cluster size)
  111. $dataMax = array_reduce($data, function($p,$c) {return max($p,$c);},-INF);
  112. $dataMin = array_reduce($data, function($p,$c) {return min($p,$c);}, INF);
  113. $offset = ($dataMax - $dataMin) / ($numCls << 8);
  114. // initial clustering
  115. $accumulator = array(array('sum'=>0,'count'=>count($data),'mean'=>$data[0]));
  116. $classMap=array_map(function($item){return "0";}, $data);
  117. // initialize exit condition
  118. $classMappingChanged = false;
  119. while ($classMappingChanged || count($accumulator) < $numCls) {
  120. // update step
  121. $accumulator = array_map(function($item){return array('sum'=>0,'sqSum'=>0,'count'=>0,'mean'=>$item['mean']);}, $accumulator);
  122. array_walk($data, function($value, $dIdx) use (&$accumulator, $classMap) {
  123. $cls = $classMap[$dIdx];
  124. $accumulator[$cls]['sum'] += $value;
  125. $accumulator[$cls]['sqSum'] += $value*$value;
  126. $accumulator[$cls]['count']++;
  127. });
  128. // calculate new cluster centers and energy
  129. array_walk($accumulator, function(&$item){
  130. if ($item['count'] != 0) {
  131. $item['mean'] = $item['sum'] / $item['count'];
  132. //$item['energy'] = ($item['sqSum'] / $item['count']) - ($item['mean'] * $item['mean']); // normalized energy
  133. $item['energy'] = $item['sqSum'] - ($item['mean'] * $item['mean'] * $item['count']); // total energy
  134. }
  135. });
  136. #echo '<br>ACCU : ';var_dump($accumulator); echo '<br>';
  137. #echo '<br>MEANS: ';var_dump(array_map(function($item) {return $item['mean'];}, $accumulator)); echo '<br>';
  138. #echo '<br>COUNTS: ';var_dump(array_map(function($item) {return $item['count'];}, $accumulator)); echo '<br>';
  139. #echo '<br>ENERGY: ';var_dump(array_map(function($item) {return $item['energy'];}, $accumulator)); echo '<br>';
  140. // split step
  141. if (count($accumulator) < $numCls && !$classMappingChanged) {
  142. // add a new cluster by splitting the cluster with the highest energy
  143. // - get cluster with highest energy
  144. $idxMap = array_map(function($item,$idx) {
  145. return array('idx'=> $idx, 'energy'=>$item['energy']);
  146. }, $accumulator, array_keys($accumulator));
  147. array_unshift($idxMap, array('idx'=>0, 'energy'=>0));
  148. array_walk($idxMap, function($item, $idx) use (&$idxMap) {
  149. if ($item['energy'] > $idxMap[0]['energy']) {
  150. $idxMap[0]['energy'] = $item['energy'];
  151. $idxMap[0]['idx'] = $item['idx'];
  152. }
  153. });
  154. #$idxOfSplitCluster = (array_shift($idxMap))["idx"];
  155. $idxOfSplitCluster = array_shift($idxMap);
  156. $idxOfSplitCluster = $idxOfSplitCluster["idx"];
  157. // - add new cluster
  158. $accumulator = array_merge(
  159. array_slice($accumulator, 0, $idxOfSplitCluster),
  160. array(
  161. array('sum'=>0, 'count'=>0,'mean'=>$accumulator[$idxOfSplitCluster]['mean'] - $offset),
  162. array('sum'=>0, 'count'=>0,'mean'=>$accumulator[$idxOfSplitCluster]['mean'] + $offset)
  163. ),
  164. array_slice($accumulator, $idxOfSplitCluster+1)
  165. );
  166. $classMappingChanged = true;
  167. } else {
  168. // reset exit condition
  169. $classMappingChanged = false;
  170. }
  171. // assignment step
  172. array_walk($data, function($value, $dIdx) use (&$classMap, &$classMappingChanged, $accumulator) {
  173. $dist = INF;
  174. $cls = 0;
  175. for ($mIdx = 0; $mIdx < count($accumulator); $mIdx++) {
  176. $mDist = pow($value - $accumulator[$mIdx]['mean'], 2);
  177. if ($mDist < $dist) {
  178. $dist = $mDist;
  179. $cls = $mIdx;
  180. }
  181. }
  182. if ($classMap[$dIdx] != $cls) {
  183. $classMap[$dIdx] = $cls;
  184. $classMappingChanged = true;
  185. }
  186. });
  187. } // while
  188. return array_map(function($item){return $item['mean']; }, $accumulator);
  189. }
  190. function getNumberOfUniqueValues($array){
  191. return count(array_unique($array, SORT_REGULAR));
  192. }
  193. function glätten($array) {
  194. $length = count($array);
  195. return array_map(function($value,$idx) use ($array, $length){
  196. return 0.4*$value
  197. + 0.25*(($idx+1 < $length ? $array[$idx+1] : 0) + ($idx-1 >= 0 ? $array[$idx-1] : 0))
  198. + 0.05*(($idx+2 < $length ? $array[$idx+2] : 0) + ($idx-2 >= 0 ? $array[$idx-2] : 0));
  199. }, $array, array_keys($array));
  200. }
  201. function findLocalMaxima($array){
  202. $maxima = array();
  203. array_walk($array, function($value, $idx, $userdata) {
  204. $left = $idx > 0 ? $idx - 1 : $idx;
  205. $right = $idx < $userdata['maxIdx'] ? $idx + 1 : $idx;
  206. if ($value > $userdata['array'][$left] && $value >= $userdata['array'][$right]) $userdata['maxima'][] = $idx;
  207. }, array('maxima'=> &$maxima, 'array'=>$array, 'maxIdx' => count($array)-1));
  208. return $maxima;
  209. }
  210. }
  211. ?>