Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
HierarchicalClustering.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2013.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Lars Nilse $
32 // $Authors: Bastian Blank $
33 // --------------------------------------------------------------------------
34 
35 #include <cmath>
36 #include <limits>
37 #include <map>
38 #include <queue>
39 #include <boost/unordered/unordered_set.hpp>
40 
42 #include <OpenMS/CONCEPT/Types.h>
43 
44 #ifndef OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
45 #define OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
46 
47 namespace OpenMS
48 {
66  template <typename PointRef>
68  {
69 public:
75 
80  class BoundingBox :
81  public std::pair<PointCoordinate, PointCoordinate>
82  {
83 public:
85  std::pair<PointCoordinate, PointCoordinate>(std::make_pair(p, p))
86  {}
87 
88  BoundingBox(const BoundingBox & b) :
89  std::pair<PointCoordinate, PointCoordinate>(b)
90  {}
91 
93  {
94  return this->second - this->first;
95  }
96 
99  {
100  typename PointCoordinate::iterator lit;
101  typename PointCoordinate::const_iterator rit;
102 
103  // Calculate lower bound
104  lit = this->first.begin(); rit = rhs.first.begin();
105  for (; lit != this->first.end(); ++lit, ++rit) *lit = std::min(*lit, *rit);
106 
107  // Calculate upper bound
108  lit = this->second.begin(); rit = rhs.second.begin();
109  for (; lit != this->second.end(); ++lit, ++rit) *lit = std::max(*lit, *rit);
110 
111  return *this;
112  }
113 
115  BoundingBox operator|(const BoundingBox & rhs) const
116  {
117  BoundingBox ret(*this);
118  ret |= rhs;
119  return ret;
120  }
121 
122  operator PointCoordinate() const
123  {
124  // (first + second) / 2
125  return coordScalarDiv_(this->first + this->second, 2);
126  }
127  };
128 
133  class Cluster :
134  public boost::unordered_multimap<PointCoordinate, PointRef>
135  {
136 public:
138 
140  bbox(bbox)
141  {}
142  };
143 
148 
155 
156 protected:
158  class TreeNode
159  {
160 public:
165  const bool center;
166  const PointRef ref;
167 
168  TreeNode(const PointCoordinate & coord, const PointRef & ref, bool center) :
169  coord(coord), bbox(coord), left(0), right(0), points(1), center(center), ref(ref)
170  {}
171 
173  coord(coord), bbox(bbox),
174  left(left), right(right),
175  points(left->points + right->points),
176  center(left->center && right->center),
177  ref(PointRef())
178  {}
179  };
180 
181  typedef std::map<typename Grid::CellIndex, std::pair<typename Grid::CellContent *, bool> > ClusterCells;
182  typedef boost::unordered_set<TreeNode *> ClusterTrees;
183 
186  {
187 public:
190 
192  distance(distance), left(left), right(right)
193  {}
194 
195  bool operator>(const TreeDistance & rhs) const
196  {
197  return distance > rhs.distance;
198  }
199 
200  };
201 
203  typedef std::priority_queue<TreeDistance, std::vector<TreeDistance>, std::greater<TreeDistance> > TreeDistanceQueue;
204 
205 public:
210  HierarchicalClustering(const PointCoordinate & cluster_dimension) :
211  grid(cluster_dimension)
212  {}
213 
220  typename Grid::cell_iterator insertPoint(const PointCoordinate & d, const PointRef & ref)
221  {
222  typename Grid::cell_iterator it = insertCluster_(d);
223  it->second.insert(std::make_pair(d, ref));
224  return it;
225  }
226 
230  void cluster()
231  {
232  // Collect coordinates of all active cells
233  std::vector<typename Grid::CellIndex> cells;
234  for (typename Grid::const_grid_iterator it = grid.grid_begin(); it != grid.grid_end(); ++it)
235  cells.push_back(it->first);
236  // Cluster each available cell
237  for (typename std::vector<typename Grid::CellIndex>::const_iterator it = cells.begin(); it != cells.end(); ++it)
238  clusterIndex_(*it);
239  }
240 
241 protected:
247  template <class P>
248  typename Grid::cell_iterator insertCluster_(const P & p)
249  {
250  return grid.insert(std::make_pair(p, Cluster(p)));
251  }
252 
257  void clusterIndex_(const typename Grid::CellIndex & p);
258 
267  void gridCells5x5_(typename Grid::CellIndex cur, ClusterCells & cells);
268 
276  void gridCell_(const typename Grid::CellIndex & cur, ClusterCells & cells, bool center = false, bool ignore_missing = true)
277  {
278  try
279  {
280  cells.insert(std::make_pair(cur, std::make_pair(&grid.grid_at(cur), center)));
281  }
282  catch (std::out_of_range &)
283  {
284  if (!ignore_missing) throw;
285  }
286  }
287 
291  void addTreeDistance_(TreeNode * tree, ClusterTrees & trees, TreeDistanceQueue & dists)
292  {
293  // Infinity: no valid distance
294  DoubleReal dist_min = std::numeric_limits<DoubleReal>::infinity();
295  typename ClusterTrees::const_iterator dist_it = trees.end();
296 
297  // Generate minimal distance to existing trees
298  for (typename ClusterTrees::const_iterator it = trees.begin(); it != trees.end(); ++it)
299  {
300  if (tree == *it) continue;
301  DoubleReal dist = treeDistance_(tree, *it);
302  if (dist < dist_min)
303  {
304  dist_min = dist;
305  dist_it = it;
306  }
307  }
308 
309  // Insert distance if valid one found.
310  if (dist_it != trees.end()) dists.push(TreeDistance(dist_min, tree, *dist_it));
311 
312  // Insert tree.
313  trees.insert(tree);
314  }
315 
322  DoubleReal treeDistance_(TreeNode * left, TreeNode * right)
323  {
324  const BoundingBox bbox = left->bbox | right->bbox;
325  if (coordElemGreater_(bbox.size(), grid.cell_dimension))
326  {
327  return std::numeric_limits<DoubleReal>::infinity();
328  }
329 
330  const PointCoordinate left_scaled = coordElemDiv_(left->coord, grid.cell_dimension);
331  const PointCoordinate right_scaled = coordElemDiv_(right->coord, grid.cell_dimension);
332  return coordDist_(left_scaled, right_scaled);
333  }
334 
341  void tree2Cluster_(const TreeNode * tree, Cluster & cluster)
342  {
343  if (tree->left && tree->right)
344  {
345  tree2Cluster_(tree->left, cluster);
346  tree2Cluster_(tree->right, cluster);
347  }
348  else
349  {
350  cluster.insert(std::make_pair(tree->bbox.first, tree->ref));
351  }
352  delete tree->left;
353  delete tree->right;
354  }
355 
361  void tree2Points_(const TreeNode * tree)
362  {
363  if (tree->left && tree->right)
364  {
365  tree2Points_(tree->left);
366  tree2Points_(tree->right);
367  }
368  else
369  {
370  insertPoint(tree->bbox.first, tree->ref);
371  }
372  delete tree->left;
373  delete tree->right;
374  }
375 
377  {
378  PointCoordinate ret;
379  typename PointCoordinate::iterator it = ret.begin();
380  typename PointCoordinate::const_iterator lit = lhs.begin();
381  for (; it != ret.end(); ++it, ++lit) *it = *lit / rhs;
382  return ret;
383  }
384 
386  {
387  PointCoordinate ret;
388  typename PointCoordinate::iterator it = ret.begin();
389  typename PointCoordinate::const_iterator lit = lhs.begin(), rit = rhs.begin();
390  for (; it != ret.end(); ++it, ++lit, ++rit) *it = *lit / *rit;
391  return ret;
392  }
393 
394  static bool coordElemGreater_(const PointCoordinate & lhs, const PointCoordinate & rhs)
395  {
396  typename PointCoordinate::const_iterator lit = lhs.begin(), rit = rhs.begin();
397  for (; lit != lhs.end(); ++lit, ++rit)
398  {
399  if (*lit > *rit) return true;
400  }
401  return false;
402  }
403 
404  static DoubleReal coordDist_(const PointCoordinate & lhs, const PointCoordinate & rhs)
405  {
406  DoubleReal ret = 0;
407  PointCoordinate p = lhs - rhs;
408  typename PointCoordinate::const_iterator it = p.begin();
409  for (; it != p.end(); ++it) ret += std::pow(*it, 2.);
410  return std::sqrt(ret);
411  }
412 
413  };
414 
415  template <typename I>
416  void HierarchicalClustering<I>::clusterIndex_(const typename Grid::CellIndex & cur)
417  {
418  ClusterCells cells;
419  ClusterTrees trees;
420  TreeDistanceQueue dists;
421 
422  // Collect all cells we need
423  try
424  {
425  gridCells5x5_(cur, cells);
426  }
427  catch (std::out_of_range &)
428  {
429  return;
430  }
431 
432  // Collect and remove existing points from cells
433  for (typename ClusterCells::iterator cell_it = cells.begin(); cell_it != cells.end(); ++cell_it)
434  {
435  typename Grid::CellContent & cell_cur = *cell_it->second.first;
436  const bool & cell_center = cell_it->second.second;
437 
438  // Iterate per cluster
439  typename Grid::cell_iterator cluster_tmp_it = cell_cur.begin();
440  while (cluster_tmp_it != cell_cur.end())
441  {
442  typename Grid::cell_iterator cluster_it = cluster_tmp_it;
443  ++cluster_tmp_it;
444 
445  // Check if it is not yet a cluster, aka have only one point
446  if (cluster_it->second.size() == 1)
447  {
448  // Add each point to hash grid
449  for (typename Cluster::const_iterator point_it = cluster_it->second.begin(); point_it != cluster_it->second.end(); ++point_it)
450  {
451  const PointCoordinate & coord = point_it->first;
452  TreeNode * tree(new TreeNode(coord, point_it->second, cell_center));
453  addTreeDistance_(tree, trees, dists);
454  }
455 
456  // Remove point from hash grid cell
457  cell_cur.erase(cluster_it);
458  }
459  }
460  }
461 
462  // Try to join two subsets with minimum distance
463  while (!dists.empty())
464  {
465  const typename TreeDistanceQueue::value_type cur_dist = dists.top();
466  TreeNode * tree_left(cur_dist.left), *tree_right(cur_dist.right);
467  dists.pop();
468 
469  // Check if both trees are not yet used with a smaller distance
470  Size count_left = trees.count(tree_left), count_right = trees.count(tree_right);
471  if (count_left && count_right)
472  {
473  trees.erase(tree_left);
474  trees.erase(tree_right);
475 
476  const BoundingBox bbox = tree_left->bbox | tree_right->bbox;
477 
478  // Arithmethic mean: (left * left.points + right * right.points) / (left.points + right.points)
479  const PointCoordinate & left = tree_left->coord, & right = tree_right->coord;
480  const UInt & left_points = tree_left->points, & right_points = tree_right->points;
481  const PointCoordinate coord = coordScalarDiv_(left * left_points + right * right_points, left_points + right_points);
482 
483  TreeNode * tree(new TreeNode(coord, bbox, tree_left, tree_right));
484 
485  addTreeDistance_(tree, trees, dists);
486  }
487  // Re-add a distance for the tree not yet used.
488  // Otherwise this subset is lost even if it is not yet maximal.
489  else if (count_left)
490  addTreeDistance_(tree_left, trees, dists);
491  else if (count_right)
492  addTreeDistance_(tree_right, trees, dists);
493  }
494 
495  // Add data back to grid
496  for (typename ClusterTrees::iterator tree_it = trees.begin(); tree_it != trees.end(); ++tree_it)
497  {
498  // We got a finished tree with all points in the center, add cluster at centroid
499  if ((**tree_it).center)
500  {
501  Cluster & cluster = insertCluster_((**tree_it).bbox)->second;
502  tree2Cluster_(*tree_it, cluster);
503  }
504  // We got a finished tree but not all points in the center, readd as single points
505  else
506  {
507  tree2Points_(*tree_it);
508  }
509  delete *tree_it;
510  }
511  }
512 
513  template <typename I>
514  void HierarchicalClustering<I>::gridCells5x5_(typename Grid::CellIndex base, ClusterCells & cells)
515  {
516  // (0, 0)
517  gridCell_(base, cells, true, false);
518 
519  typename Grid::CellIndex cur = base;
520  cur[0] -= 2;
521  // (-2, -2)
522  cur[1] -= 2; gridCell_(cur, cells);
523  // (-2, -1)
524  cur[1] += 1; gridCell_(cur, cells);
525  // (-2, 0)
526  cur[1] += 1; gridCell_(cur, cells);
527  // (-2, 1)
528  cur[1] += 1; gridCell_(cur, cells);
529  // (-2, 2)
530  cur[1] += 1; gridCell_(cur, cells);
531 
532  cur = base; cur[0] -= 1;
533  // (-1, -2)
534  cur[1] -= 2; gridCell_(cur, cells);
535  // (-1, -1)
536  cur[1] += 1; gridCell_(cur, cells, true);
537  // (-1, 0)
538  cur[1] += 1; gridCell_(cur, cells, true);
539  // (-1, 1)
540  cur[1] += 1; gridCell_(cur, cells, true);
541  // (-1, 2)
542  cur[1] += 1; gridCell_(cur, cells);
543 
544  cur = base;
545  // (0, -2)
546  cur[1] -= 2; gridCell_(cur, cells);
547  // (0, -1)
548  cur[1] += 1; gridCell_(cur, cells, true);
549  // (0, 0)
550  cur[1] += 1;
551  // (0, 1)
552  cur[1] += 1; gridCell_(cur, cells, true);
553  // (0, 2)
554  cur[1] += 1; gridCell_(cur, cells);
555 
556  cur = base; cur[0] += 1;
557  // (1, -2)
558  cur[1] -= 2; gridCell_(cur, cells);
559  // (1, -1)
560  cur[1] += 1; gridCell_(cur, cells, true);
561  // (1, 0)
562  cur[1] += 1; gridCell_(cur, cells, true);
563  // (1, 1)
564  cur[1] += 1; gridCell_(cur, cells, true);
565  // (1, 2)
566  cur[1] += 1; gridCell_(cur, cells);
567 
568  cur = base; cur[0] += 2;
569  // (2, -2)
570  cur[1] -= 2; gridCell_(cur, cells);
571  // (2, -1)
572  cur[1] += 1; gridCell_(cur, cells);
573  // (2, 0)
574  cur[1] += 1; gridCell_(cur, cells);
575  // (2, 1)
576  cur[1] += 1; gridCell_(cur, cells);
577  // (2, 2)
578  cur[1] += 1; gridCell_(cur, cells);
579  }
580 
581 }
582 
583 #endif /* OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H */
Container for (2-dimensional coordinate, value) pairs.
Definition: HashGrid.h:62
void tree2Points_(const TreeNode *tree)
Recursively add the points of an unfinished cluster back to the grid. All points are saved in the lea...
Definition: HierarchicalClustering.h:361
DoubleReal treeDistance_(TreeNode *left, TreeNode *right)
Returns distance of two tree nodes Returns the euclidean distance of the coordinates of the two trees...
Definition: HierarchicalClustering.h:322
BoundingBox(const PointCoordinate &p)
Definition: HierarchicalClustering.h:84
const CoordinateType * const_iterator
Definition: DPosition.h:75
DoubleReal distance
Definition: HierarchicalClustering.h:188
Grid grid
The hash grid.
Definition: HierarchicalClustering.h:154
Generic 2-dimensional hierarchical clustering with geometric hashing.
Definition: HierarchicalClustering.h:67
void gridCell_(const typename Grid::CellIndex &cur, ClusterCells &cells, bool center=false, bool ignore_missing=true)
Collect one cell.
Definition: HierarchicalClustering.h:276
static PointCoordinate coordScalarDiv_(const PointCoordinate &lhs, const DoubleReal &rhs)
Definition: HierarchicalClustering.h:376
const BoundingBox bbox
Definition: HierarchicalClustering.h:162
boost::unordered_set< TreeNode * > ClusterTrees
Definition: HierarchicalClustering.h:182
BoundingBox & operator|=(const BoundingBox &rhs)
Intersection of bounding box.
Definition: HierarchicalClustering.h:98
BoundingBox(const BoundingBox &b)
Definition: HierarchicalClustering.h:88
Set of points. Describes a cluster on the grid. A point consists of a PointCoordinate and a PointRef...
Definition: HierarchicalClustering.h:133
const PointCoordinate coord
Definition: HierarchicalClustering.h:161
Grid::cell_iterator insertPoint(const PointCoordinate &d, const PointRef &ref)
Insert new PointCoordinate into grid.
Definition: HierarchicalClustering.h:220
const PointRef ref
Definition: HierarchicalClustering.h:166
static PointCoordinate coordElemDiv_(const PointCoordinate &lhs, const PointCoordinate &rhs)
Definition: HierarchicalClustering.h:385
ConstIterator end() const
Non-mutable end iterator.
Definition: DPosition.h:389
CoordinateType * iterator
Definition: DPosition.h:74
Wrapper class for two trees and the corresponding distance.
Definition: HierarchicalClustering.h:185
TreeNode(const PointCoordinate &coord, const PointRef &ref, bool center)
Definition: HierarchicalClustering.h:168
PointCoordinate size() const
Definition: HierarchicalClustering.h:92
Representation of a coordinate in D-dimensional space.
Definition: DPosition.h:52
DPosition< 2, DoubleReal > PointCoordinate
Coordinate of a point to be clustered.
Definition: HierarchicalClustering.h:74
Grid::cell_iterator insertCluster_(const P &p)
Insert new Cluster into grid.
Definition: HierarchicalClustering.h:248
BoundingBox operator|(const BoundingBox &rhs) const
Intersection of bounding box.
Definition: HierarchicalClustering.h:115
static DoubleReal coordDist_(const PointCoordinate &lhs, const PointCoordinate &rhs)
Definition: HierarchicalClustering.h:404
void clusterIndex_(const typename Grid::CellIndex &p)
Perform clustering at given cell index.
Definition: HierarchicalClustering.h:416
TreeNode * right
Definition: HierarchicalClustering.h:189
void tree2Cluster_(const TreeNode *tree, Cluster &cluster)
Recursively add the points of a finished cluster into the hash grid. All points are saved in the leaf...
Definition: HierarchicalClustering.h:341
UInt points
Definition: HierarchicalClustering.h:164
void cluster()
Perform clustering of all existing points.
Definition: HierarchicalClustering.h:230
TreeNode(const PointCoordinate &coord, const BoundingBox &bbox, TreeNode *left, TreeNode *right)
Definition: HierarchicalClustering.h:172
HierarchicalClustering(const PointCoordinate &cluster_dimension)
Constructor.
Definition: HierarchicalClustering.h:210
HashGrid< Cluster > Grid
The hash grid data type.
Definition: HierarchicalClustering.h:147
std::priority_queue< TreeDistance, std::vector< TreeDistance >, std::greater< TreeDistance > > TreeDistanceQueue
Priority queue queue used to find minimum distances.
Definition: HierarchicalClustering.h:203
void addTreeDistance_(TreeNode *tree, ClusterTrees &trees, TreeDistanceQueue &dists)
Add a new tree to the set of trees and distance queue.
Definition: HierarchicalClustering.h:291
Tree node used for clustering.
Definition: HierarchicalClustering.h:158
const bool center
Definition: HierarchicalClustering.h:165
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:144
std::map< typename Grid::CellIndex, std::pair< typename Grid::CellContent *, bool > > ClusterCells
Definition: HierarchicalClustering.h:181
TreeNode * left
Definition: HierarchicalClustering.h:189
TreeNode * left
Definition: HierarchicalClustering.h:163
bool operator>(const TreeDistance &rhs) const
Definition: HierarchicalClustering.h:195
void gridCells5x5_(typename Grid::CellIndex cur, ClusterCells &cells)
Collect all cells used to cluster at given cell index.
Definition: HierarchicalClustering.h:514
TreeNode * right
Definition: HierarchicalClustering.h:163
ConstIterator begin() const
Non-mutable begin iterator.
Definition: DPosition.h:383
Cluster(const BoundingBox &bbox)
Definition: HierarchicalClustering.h:139
TreeDistance(const DoubleReal &distance, TreeNode *left, TreeNode *right)
Definition: HierarchicalClustering.h:191
static bool coordElemGreater_(const PointCoordinate &lhs, const PointCoordinate &rhs)
Definition: HierarchicalClustering.h:394
Bounding box of cluster.
Definition: HierarchicalClustering.h:80
BoundingBox bbox
Definition: HierarchicalClustering.h:137

OpenMS / TOPP release 1.11.1 Documentation generated on Thu Nov 14 2013 11:19:14 using doxygen 1.8.5