PageRenderTime 46ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/qt/semantic_engine/src/clusters.cpp

http://semantic-engine.googlecode.com/
C++ | 397 lines | 291 code | 84 blank | 22 comment | 39 complexity | 9f8ae34f05967ae7e4d1f8edcb71964e MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. #include <iostream>
  2. // semantic
  3. #include <semantic/analysis/linlog.hpp>
  4. #include <semantic/analysis/agglomerate_clustering/cluster_helper.hpp>
  5. #include "clusters.h"
  6. #include "single_result.h"
  7. void ClusterWidget::calculateClusters(){
  8. numberOfClusters->setEnabled(false);
  9. calculateClustersButton->setEnabled(false);
  10. numberOfClusters->setHidden(false);
  11. clusterNumberLabel->setHidden(false);
  12. startThread();
  13. // doClustering();
  14. }
  15. void ClusterWidget::updateProgress(int s, int total) {
  16. if (clusterProgress->maximum() != total) clusterProgress->setMaximum(total);
  17. clusterProgress->setValue(s);
  18. }
  19. ClusterWidget::ClusterWidget(Graph *g, QWidget *parent)
  20. : QWidget(parent), m_graph(g)
  21. {
  22. setupLayout();
  23. setupConnections();
  24. }
  25. void ClusterWidget::setStatus(QString status){
  26. searchStatus->setStatus(status);
  27. }
  28. void ClusterWidget::startSearch(){
  29. calculateClustersButton->setVisible(false);
  30. clusterProgress->setVisible(false);
  31. clusterTree->setVisible(false);
  32. numberOfClusters->setVisible(false);
  33. clusterNumberLabel->setVisible(false);
  34. setStatus(tr("Searching ..."));
  35. searchStatus->setVisible(true);
  36. }
  37. void ClusterWidget::displayResults(searchType type){
  38. if( clusterTree->topLevelItemCount() > 0){
  39. if( type != KeywordSearch ){
  40. searchStatus->setVisible(false);
  41. clusterNumberLabel->setVisible(true);
  42. clusterProgress->setVisible(true);
  43. calculateClustersButton->setVisible(true);
  44. clusterTree->setVisible(true);
  45. numberOfClusters->setVisible(true);
  46. } else {
  47. setStatus(tr("Clustering disabled on keyword searches"));
  48. }
  49. } else {
  50. setStatus(tr("No Results"));
  51. }
  52. }
  53. void ClusterWidget::setResultData(QStringList topTerms, QList<QPair<QString,double> > resultData){
  54. numberOfClusters->setHidden(true);
  55. clusterNumberLabel->setHidden(true);
  56. maxClusters = resultData.count() - 1;
  57. QString countStr = QVariant(resultData.count()).toString();
  58. QStringList labels;
  59. QStringList topFive;
  60. for( int i=0; i < topTerms.count() && i < 5; ++i){
  61. topFive << topTerms.at(i);
  62. }
  63. labels << countStr << topFive.join(", ");
  64. clusterTree->clear();
  65. if( resultData.count() > 0){
  66. QTreeWidgetItem *tResults = new QTreeWidgetItem(labels);
  67. tResults->setBackground(0,m_backgroundColor);
  68. tResults->setBackground(1,m_backgroundColor);
  69. for( int i = 0; i < resultData.count(); ++i){
  70. QStringList tLabels;
  71. QString title = resultData.at(i).first;
  72. tLabels << QString() << title;
  73. QTreeWidgetItem *item = new QTreeWidgetItem(tLabels);
  74. item->setFlags(Qt::ItemIsDragEnabled | Qt::ItemIsEnabled | Qt::ItemIsSelectable);
  75. tResults->addChild(item);
  76. }
  77. clusterTree->addTopLevelItem(tResults);
  78. clusterTree->expandAll();
  79. calculateClustersButton->setEnabled(true);
  80. }
  81. }
  82. void ClusterWidget::setupConnections(){
  83. connect(calculateClustersButton, SIGNAL(clicked()), this, SLOT(calculateClusters()));
  84. connect(clusterTree, SIGNAL(itemDoubleClicked(QTreeWidgetItem *,int)),this, SLOT(displayItem(QTreeWidgetItem *,int)));
  85. connect(numberOfClusters, SIGNAL(valueChanged(int)), this, SLOT(numClustersChanged(int)));
  86. }
  87. void ClusterWidget::displayItem(QTreeWidgetItem *item,int){
  88. int col = 1; // valid column is always 1
  89. QString title = item->data(col,Qt::DisplayRole).toString();
  90. // allow double clicking only on child items
  91. if( !(item->flags() &= Qt::ItemIsUserCheckable | Qt::ItemIsDropEnabled) ){
  92. if( QFile::exists( title ) ){
  93. QDesktopServices::openUrl(QUrl::fromLocalFile(title));
  94. } else {
  95. /* emit */ displaySingleResultWindow(title);
  96. }
  97. } else {
  98. if( item->isExpanded() ){
  99. item->setExpanded(false);
  100. } else {
  101. item->setExpanded(true);
  102. }
  103. }
  104. }
  105. void ClusterWidget::setupLayout(){
  106. // cluster tree
  107. clusterTree = new ClusterTreeWidget;
  108. m_backgroundColor = QBrush(QColor(160,206,221));
  109. // buttons and progress bar
  110. QHBoxLayout *statusLayout = new QHBoxLayout;
  111. clusterProgress = new QProgressBar;
  112. calculateClustersButton = new QPushButton(tr("Create Clusters"));
  113. calculateClustersButton->setEnabled(false);
  114. numberOfClusters = new QSpinBox;
  115. numberOfClusters->setMinimum(1);
  116. numberOfClusters->setMaximum(20);
  117. numberOfClusters->setHidden(true);
  118. clusterNumberLabel = new QLabel(tr("Number of Clusters"));
  119. clusterNumberLabel->setHidden(true);
  120. statusLayout->addWidget(clusterProgress);
  121. statusLayout->addWidget(clusterNumberLabel);
  122. statusLayout->addWidget(numberOfClusters);
  123. statusLayout->addWidget(calculateClustersButton);
  124. statusLayout->setAlignment(Qt::AlignRight);
  125. searchStatus = new SearchStatus();
  126. searchStatus->setAlignment(Qt::AlignCenter);
  127. searchStatus->setVisible(false);
  128. QVBoxLayout *mainLayout = new QVBoxLayout;
  129. mainLayout->addSpacing(20);
  130. mainLayout->addWidget(clusterTree);
  131. mainLayout->addLayout(statusLayout);
  132. mainLayout->addWidget(searchStatus);
  133. setLayout(mainLayout);
  134. }
  135. ClusterTreeWidget::ClusterTreeWidget(QTreeWidget *parent)
  136. : QTreeWidget(parent)
  137. {
  138. setColumnCount(2);
  139. setSelectionMode(QAbstractItemView::ExtendedSelection);
  140. //setDragDropMode(QAbstractItemView::DragOnly);
  141. //setDragEnabled(true);
  142. QStringList labels;
  143. labels << tr("Document Count") << tr("Cluster");
  144. setHeaderLabels(labels);
  145. setTextElideMode(Qt::ElideMiddle);
  146. }
  147. Qt::DropActions ClusterTreeWidget::supportedDropActions() const {
  148. return Qt::CopyAction;
  149. }
  150. void ClusterTreeWidget::mouseMoveEvent(QMouseEvent *){
  151. QItemSelectionModel *selections = this->selectionModel();
  152. QModelIndexList list = selections->selectedRows(1);
  153. for( int i = 0; i < list.size(); ++i){
  154. QDrag *drag = new QDrag(this);
  155. QModelIndex index = list.at(i);
  156. // don't allow drags for top-level items
  157. if( !(index.flags() &= Qt::ItemIsUserCheckable | Qt::ItemIsDropEnabled) ){
  158. QMimeData *mimeData = new QMimeData;
  159. mimeData->setHtml(index.data(Qt::DisplayRole).toString());
  160. drag->setMimeData(mimeData);
  161. drag->start();
  162. }
  163. }
  164. }
  165. void ClusterWidget::setWeightMap(QPair<WeightingTraits::edge_weight_map,WeightingTraits::vertex_weight_map> weightMap){
  166. m_weightMap = weightMap.first;
  167. m_rankMap = weightMap.second;
  168. }
  169. ClusteringThread::ClusteringThread(Graph *g, WeightMapType w, QObject *parent)
  170. : QThread(parent), graph_(g), weights_(w), dendrogram_(*graph_), safeTerminate_(false) {}
  171. void ClusteringThread::safeTerminate() {
  172. safeTerminate_ = true;
  173. }
  174. void ClusteringThread::run() {
  175. startedClustering();
  176. unsigned int max_num_clusters = num_vertices(*graph_)-1;
  177. if (max_num_clusters>20) max_num_clusters = 20;
  178. if( max_num_clusters < 3 ){
  179. return;
  180. }
  181. LinLog::all_maps maps;
  182. LinLogHelper::populate_all_maps(*graph_, weights_, maps);
  183. // create a minimizer
  184. semantic::analysis::detail::MinimizerBarnesHutHelper<LinLog::all_maps>
  185. minimizer(maps, 1.0f, 0.0f, 0.01f);
  186. const int iterations = 100;
  187. minimizer.setIterations(iterations);
  188. int step;
  189. while((step = minimizer.step()) != -1) {
  190. if (safeTerminate_) return;
  191. /*emit*/ progress(step, iterations + max_num_clusters);
  192. }
  193. progress(iterations, iterations + max_num_clusters);
  194. if (safeTerminate_) return;
  195. // use the position map to generate a distance map
  196. typedef semantic::se_graph_traits<Graph>::vertex_pair_edge edge;
  197. typedef semantic::se_graph_traits<Graph>::vertex_descriptor vertex;
  198. maps::unordered<edge, double> distances;
  199. maps::unordered<vertex, maps::unordered<vertex, double> > distances_2d;
  200. BGL_FORALL_VERTICES(u, *graph_, Graph) {
  201. if (safeTerminate_) return;
  202. BGL_FORALL_VERTICES(v, *graph_, Graph) {
  203. edge e1(u,v), e2(v,u);
  204. double distance = maps.position[maps.vertex_index[u]].dist(maps.position[maps.vertex_index[v]]);
  205. distances[e1] = distances[e2] = distance;
  206. distances_2d[u][v] = distances_2d[v][u] = distance;
  207. }
  208. }
  209. if (safeTerminate_) return;
  210. // now generate an MST
  211. std::vector<edge> mst;
  212. semantic::minimum_weight_spanning_tree(*graph_, extract_keys(distances.begin()), extract_keys(distances.end()), boost::make_assoc_property_map(distances), back_inserter(mst));
  213. if (safeTerminate_) return;
  214. // create the dendrogram
  215. semantic::dendrogram_from_distance_mst(*graph_, mst, boost::make_assoc_property_map(distances), dendrogram_, SingleLinkDistanceCalculator());
  216. // now let's silhouette this baby
  217. // measuring clusters between having 2 and having n-1 (or 30, whichever is smaller)... test silhouettes
  218. unsigned int best_num = 0;
  219. double best_silhouette = 0;
  220. for(unsigned int i = 2; i < max_num_clusters; i++) {
  221. maps::unordered<vertex, unsigned long> cluster_map;
  222. dendrogram_.set_num_clusters(i);
  223. dendrogram_.get_clusters(inserter(cluster_map, cluster_map.begin()));
  224. // silhouette!!!
  225. maps::unordered<vertex, double> quality_map;
  226. double quality = semantic::analysis::silhouette(*graph_, boost::make_assoc_property_map(cluster_map), boost::make_assoc_property_map(distances_2d), boost::make_assoc_property_map(quality_map));
  227. if (quality > best_silhouette) {
  228. best_silhouette = quality;
  229. best_num = i;
  230. }
  231. progress(iterations + i, iterations + max_num_clusters);
  232. }
  233. if (best_num < 2) best_num = 5; // why not
  234. dendrogram_.set_num_clusters(best_num);
  235. progress(iterations + max_num_clusters, iterations + max_num_clusters);
  236. finishedClustering();
  237. }
  238. semantic::dendrogram<Graph> & ClusteringThread::dendrogram() {
  239. return dendrogram_;
  240. }
  241. void ClusterWidget::numClustersChanged(int num) {
  242. if (!m_clusteringThread || m_clusteringThread->isRunning()) return;
  243. m_clusteringThread->dendrogram().set_num_clusters(num);
  244. clusteringCompleted(); // rebuild the view
  245. }
  246. void ClusterWidget::clusteringStarted() {
  247. clusterProgress->setEnabled(true);
  248. }
  249. void ClusterWidget::clusteringCompleted() {
  250. numberOfClusters->setEnabled(true);
  251. calculateClustersButton->setEnabled(true);
  252. typedef GraphTraits::vertex_descriptor vertex;
  253. // rebuild the view
  254. cluster_helper<Graph> helper;
  255. helper.build( *m_graph,
  256. m_clusteringThread->dendrogram(),
  257. boost::make_assoc_property_map(m_rankMap));
  258. unsigned int count = helper.count();
  259. numberOfClusters->setValue(count);
  260. clusterTree->clear(); // just in case
  261. for(unsigned int i = 0; i < count; i++) {
  262. std::vector<vertex> docs, terms;
  263. docs = helper.docs(i);
  264. terms = helper.terms(i);
  265. std::vector<std::string> terms_str;
  266. for(unsigned int k = 0; k < terms.size() && k < 5; k++) {
  267. std::string term = (*m_graph)[terms[k]].content;
  268. term = m_graph->unstem_term(term);
  269. if( term.size() ){
  270. terms_str.push_back( term );
  271. }
  272. }
  273. std::string terms_list = join(terms_str.begin(), terms_str.end(), ", ");
  274. QTreeWidgetItem *item = new QTreeWidgetItem(QStringList() << QString::number(docs.size()) << QString::fromStdString(terms_list));
  275. item->setBackground(0, m_backgroundColor);
  276. item->setBackground(1, m_backgroundColor);
  277. // get the document names now
  278. for(unsigned int k = 0; k < docs.size(); k++) {
  279. std::string doc_title = (*m_graph)[docs[k]].content;
  280. QString title = QString::fromStdString(doc_title);
  281. /* title = title.replace(QRegExp("&nbsp;?"), " ");
  282. title = title.replace(QRegExp("&#333;?"), "o");
  283. title = title.replace(QRegExp("&#332;?"), "O");
  284. title = title.replace(QRegExp("&#363;?"), "u");
  285. title = title.replace(QRegExp("&#362;?"), "U"); */
  286. QTreeWidgetItem *docItem = new QTreeWidgetItem(QStringList() << QString() << title);
  287. docItem->setFlags(Qt::ItemIsDragEnabled | Qt::ItemIsEnabled | Qt::ItemIsSelectable);
  288. item->addChild(docItem);
  289. }
  290. if( docs.size() > 0 ){
  291. clusterTree->addTopLevelItem(item);
  292. }
  293. }
  294. clusterProgress->reset();
  295. clusterProgress->setEnabled(false);
  296. }
  297. void ClusterWidget::startThread() {
  298. m_clusteringThread = new ClusteringThread(m_graph, boost::make_assoc_property_map(m_weightMap));
  299. connect(m_clusteringThread, SIGNAL(startedClustering()), this, SLOT(clusteringStarted()));
  300. connect(m_clusteringThread, SIGNAL(finishedClustering()), this, SLOT(clusteringCompleted()));
  301. connect(m_clusteringThread, SIGNAL(progress(int, int)), this, SLOT(updateProgress(int, int)));
  302. m_clusteringThread->start();
  303. }
  304. void ClusterWidget::killThread() {
  305. if (m_clusteringThread!=0) {
  306. std::cerr << "stopping clustering thread" << std::endl;
  307. m_clusteringThread->safeTerminate();
  308. m_clusteringThread->wait();
  309. m_clusteringThread->deleteLater();
  310. m_clusteringThread = 0;
  311. }
  312. clusterProgress->setEnabled(false);
  313. }