Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
IDMapper.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2013.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Marc Sturm, Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #ifndef OPENMS_ANALYSIS_ID_IDMAPPER_H
36 #define OPENMS_ANALYSIS_ID_IDMAPPER_H
37 
42 
43 #include <algorithm>
44 #include <limits>
45 
46 namespace OpenMS
47 {
62  class OPENMS_DLLAPI IDMapper :
63  public DefaultParamHandler
64  {
65 public:
66  enum Measure {MEASURE_PPM = 0, MEASURE_DA};
67 
69  IDMapper();
70 
72  IDMapper(const IDMapper & cp);
73 
75  IDMapper & operator=(const IDMapper & rhs);
76 
89  template <typename PeakType>
90  void annotate(MSExperiment<PeakType> & map, const std::vector<PeptideIdentification> & ids, const std::vector<ProteinIdentification> & protein_ids)
91  {
92  checkHits_(ids);
93 
94  //append protein identifications
95  map.getProteinIdentifications().insert(map.getProteinIdentifications().end(), protein_ids.begin(), protein_ids.end());
96 
97  //store mapping of scan RT to index
98  std::multimap<DoubleReal, Size> experiment_precursors;
99  for (Size i = 0; i < map.size(); i++)
100  {
101  experiment_precursors.insert(std::make_pair(map[i].getRT(), i));
102  }
103 
104  //store mapping of identification RT to index
105  std::multimap<DoubleReal, Size> identifications_precursors;
106  for (Size i = 0; i < ids.size(); i++)
107  {
108  identifications_precursors.insert(std::make_pair(ids[i].getMetaValue("RT"), i));
109  }
110 
111  //calculate the actual mapping
112  std::multimap<DoubleReal, Size>::iterator experiment_iterator = experiment_precursors.begin();
113  std::multimap<DoubleReal, Size>::iterator identifications_iterator = identifications_precursors.begin();
114  Size matches(0);
115  while (experiment_iterator != experiment_precursors.end() && identifications_iterator != identifications_precursors.end())
116  {
117  while (identifications_iterator != identifications_precursors.end())
118  {
119  // testing whether the retention times are within the precision threshold
120  if (fabs(experiment_iterator->first - identifications_iterator->first) < rt_tolerance_)
121  {
122  // testing whether the m/z fits
123  if (!map[experiment_iterator->second].getPrecursors().empty())
124  {
125  if (fabs((DoubleReal)(ids[identifications_iterator->second].getMetaValue("MZ")) - map[experiment_iterator->second].getPrecursors()[0].getMZ()) < mz_tolerance_)
126  {
127  if (!(ids[identifications_iterator->second].empty()))
128  {
129  map[experiment_iterator->second].getPeptideIdentifications().push_back(ids[identifications_iterator->second]);
130  ++matches;
131  }
132  }
133  }
134  }
135  ++identifications_iterator;
136  }
137  identifications_iterator = identifications_precursors.begin();
138  ++experiment_iterator;
139  }
140 
141  // some statistics output
142  LOG_INFO << "Unassigned peptides: " << ids.size() - matches << "\n"
143  << "Peptides assigned to a precursor: " << matches << std::endl;
144 
145  }
146 
164  template <typename FeatureType>
165  void annotate(FeatureMap<FeatureType> & map, const std::vector<PeptideIdentification> & ids, const std::vector<ProteinIdentification> & protein_ids, bool use_centroid_rt = false, bool use_centroid_mz = false)
166  {
167  // std::cout << "Starting annotation..." << std::endl;
168  checkHits_(ids);
169 
170  // append protein identifications
171  map.getProteinIdentifications().insert(map.getProteinIdentifications().end(), protein_ids.begin(), protein_ids.end());
172 
173  // check if all features have at least one convex hull
174  // if not, use the centroid and the given tolerances
175  if (!(use_centroid_rt && use_centroid_mz))
176  {
177  for (typename FeatureMap<FeatureType>::Iterator f_it = map.begin(); f_it != map.end(); ++f_it)
178  {
179  if (f_it->getConvexHulls().empty())
180  {
181  use_centroid_rt = true;
182  use_centroid_mz = true;
183  LOG_WARN << "IDMapper warning: at least one feature has no convex hull - using centroid coordinates for matching" << std::endl;
184  break;
185  }
186  }
187  }
188 
189  bool use_avg_mass = false; // use avg. peptide masses for matching?
190  if (use_centroid_mz && (param_.getValue("mz_reference") == "peptide"))
191  {
192  // if possible, check which m/z value is reported for features,
193  // so the appropriate peptide mass can be used for matching
194  use_avg_mass = checkMassType_(map.getDataProcessing());
195  }
196 
197  // calculate feature bounding boxes only once:
198  std::vector<DBoundingBox<2> > boxes;
199  DoubleReal min_rt = std::numeric_limits<DoubleReal>::max(),
200  max_rt = -std::numeric_limits<DoubleReal>::max();
201  // std::cout << "Precomputing bounding boxes..." << std::endl;
202  boxes.reserve(map.size());
203  for (typename FeatureMap<FeatureType>::Iterator f_it = map.begin();
204  f_it != map.end(); ++f_it)
205  {
206  DBoundingBox<2> box;
207  if (!(use_centroid_rt && use_centroid_mz))
208  {
209  box = f_it->getConvexHull().getBoundingBox();
210  }
211  if (use_centroid_rt)
212  {
213  box.setMinX(f_it->getRT());
214  box.setMaxX(f_it->getRT());
215  }
216  if (use_centroid_mz)
217  {
218  box.setMinY(f_it->getMZ());
219  box.setMaxY(f_it->getMZ());
220  }
221  increaseBoundingBox_(box);
222  boxes.push_back(box);
223 
224  min_rt = std::min(min_rt, box.minPosition().getX());
225  max_rt = std::max(max_rt, box.maxPosition().getX());
226  }
227 
228  // hash bounding boxes of features by RT:
229  // RT range is partitioned into slices (bins) of 1 second; every feature
230  // that overlaps a certain slice is hashed into the corresponding bin
231  std::vector<std::vector<SignedSize> > hash_table;
232  // make sure the RT hash table has indices >= 0 and doesn't waste space
233  // in the beginning:
234  SignedSize offset(0);
235 
236  if (map.size() > 0)
237  {
238  // std::cout << "Setting up hash table..." << std::endl;
239  offset = SignedSize(floor(min_rt));
240  // this only works if features were found
241  hash_table.resize(SignedSize(floor(max_rt)) - offset + 1);
242  for (Size index = 0; index < boxes.size(); ++index)
243  {
244  const DBoundingBox<2> & box = boxes[index];
245  for (SignedSize i = SignedSize(floor(box.minPosition().getX()));
246  i <= SignedSize(floor(box.maxPosition().getX())); ++i)
247  {
248  hash_table[i - offset].push_back(index);
249  }
250  }
251  }
252  else
253  {
254  LOG_WARN << "IDMapper received an empty FeatureMap! All peptides are mapped as 'unassigned'!" << std::endl;
255  }
256 
257  // for statistics:
258  Size matches_none = 0, matches_single = 0, matches_multi = 0;
259 
260  // std::cout << "Finding matches..." << std::endl;
261  // iterate over peptide IDs:
262  for (std::vector<PeptideIdentification>::const_iterator id_it =
263  ids.begin(); id_it != ids.end(); ++id_it)
264  {
265  // std::cout << "Peptide ID: " << id_it - ids.begin() << std::endl;
266 
267  if (id_it->getHits().empty()) continue;
268 
269  DoubleList mz_values;
270  DoubleReal rt_value;
271  IntList charges;
272  getIDDetails_(*id_it, rt_value, mz_values, charges, use_avg_mass);
273 
274  if ((rt_value < min_rt) || (rt_value > max_rt)) // RT out of bounds
275  {
276  map.getUnassignedPeptideIdentifications().push_back(*id_it);
277  ++matches_none;
278  continue;
279  }
280 
281  // iterate over candidate features:
282  Size index = SignedSize(floor(rt_value)) - offset;
283  Size matching_features = 0;
284  for (std::vector<SignedSize>::iterator hash_it =
285  hash_table[index].begin(); hash_it != hash_table[index].end();
286  ++hash_it)
287  {
288  Feature & feat = map[*hash_it];
289 
290  // need to check the charge state?
291  bool check_charge = !ignore_charge_;
292  if (check_charge && (mz_values.size() == 1)) // check now
293  {
294  if (!charges.contains(feat.getCharge())) continue;
295  check_charge = false; // don't need to check later
296  }
297 
298  // iterate over m/z values (only one if "mz_ref." is "precursor"):
299  Size index = 0;
300  for (DoubleList::iterator mz_it = mz_values.begin();
301  mz_it != mz_values.end(); ++mz_it, ++index)
302  {
303  if (check_charge && (charges[index] != feat.getCharge()))
304  {
305  continue; // charge states need to match
306  }
307 
308  DPosition<2> id_pos(rt_value, *mz_it);
309  if (boxes[*hash_it].encloses(id_pos)) // potential match
310  {
311  if (use_centroid_mz)
312  {
313  // only one m/z value to check, which was alredy incorporated
314  // into the overall bounding box -> success!
315  feat.getPeptideIdentifications().push_back(*id_it);
316  ++matching_features;
317  break; // "mz_it" loop
318  }
319  // else: check all the mass traces
320  bool found_match = false;
321  for (std::vector<ConvexHull2D>::iterator ch_it =
322  feat.getConvexHulls().begin(); ch_it !=
323  feat.getConvexHulls().end(); ++ch_it)
324  {
325  DBoundingBox<2> box = ch_it->getBoundingBox();
326  if (use_centroid_rt)
327  {
328  box.setMinX(feat.getRT());
329  box.setMaxX(feat.getRT());
330  }
331  increaseBoundingBox_(box);
332  if (box.encloses(id_pos)) // success!
333  {
334  feat.getPeptideIdentifications().push_back(*id_it);
335  ++matching_features;
336  found_match = true;
337  break; // "ch_it" loop
338  }
339  }
340  if (found_match) break; // "mz_it" loop
341  }
342  }
343  }
344  if (matching_features == 0)
345  {
346  map.getUnassignedPeptideIdentifications().push_back(*id_it);
347  ++matches_none;
348  }
349  else if (matching_features == 1) ++matches_single;
350  else ++matches_multi;
351  }
352 
353  // some statistics output
354  LOG_INFO << "Unassigned peptides: " << matches_none << "\n"
355  << "Peptides assigned to exactly one feature: " << matches_single << "\n"
356  << "Peptides assigned to multiple features: " << matches_multi << "\n"
357  << map.getAnnotationStatistics()
358  << std::endl;
359 
360  }
361 
375  void annotate(ConsensusMap & map, const std::vector<PeptideIdentification> & ids, const std::vector<ProteinIdentification> & protein_ids, bool measure_from_subelements = false);
376 
377 protected:
378  void updateMembers_();
379 
388 
392  DoubleReal getAbsoluteMZTolerance_(const DoubleReal mz) const;
393 
395  bool isMatch_(const DoubleReal rt_distance, const DoubleReal mz_theoretical, const DoubleReal mz_observed) const;
396 
398  void checkHits_(const std::vector<PeptideIdentification> & ids) const;
399 
403  void getIDDetails_(const PeptideIdentification & id, DoubleReal & rt_pep, DoubleList & mz_values, IntList & charges, bool use_avg_mass = false) const;
404 
406  void increaseBoundingBox_(DBoundingBox<2> & box);
407 
410  bool checkMassType_(const std::vector<DataProcessing> & processing) const;
411 
412  };
413 
414 } // namespace OpenMS
415 
416 #endif // OPENMS_ANALYSIS_ID_IDMAPPER_H
bool encloses(const PositionType &position) const
Checks whether this range contains a certain point.
Definition: DBoundingBox.h:159
const ChargeType & getCharge() const
Non-mutable access to charge state.
void setMaxX(CoordinateType const c)
Mutator for min_ coordinate of the larger point.
Definition: DIntervalBase.h:278
const std::vector< PeptideIdentification > & getPeptideIdentifications() const
returns a const reference to the PeptideIdentification vector
bool contains(Int s) const
Returns if a string is contains in the list.
void setMinY(CoordinateType const c)
Mutator for max_ coordinate of the smaller point.
Definition: DIntervalBase.h:271
Size size() const
Definition: MSExperiment.h:117
const std::vector< ProteinIdentification > & getProteinIdentifications() const
non-mutable access to the protein identifications
Definition: FeatureMap.h:410
#define LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:455
const std::vector< ProteinIdentification > & getProteinIdentifications() const
returns a const reference to the protein ProteinIdentification vector
AnnotationStatistics getAnnotationStatistics() const
Definition: FeatureMap.h:520
void annotate(FeatureMap< FeatureType > &map, const std::vector< PeptideIdentification > &ids, const std::vector< ProteinIdentification > &protein_ids, bool use_centroid_rt=false, bool use_centroid_mz=false)
Mapping method for feature maps.
Definition: IDMapper.h:165
A container for features.
Definition: FeatureMap.h:111
void setMaxY(CoordinateType const c)
Mutator for max_ coordinate of the larger point.
Definition: DIntervalBase.h:285
Annotates an MSExperiment, FeatureMap or ConsensusMap with peptide identifications.
Definition: IDMapper.h:62
const std::vector< PeptideIdentification > & getUnassignedPeptideIdentifications() const
non-mutable access to the unassigned peptide identifications
Definition: FeatureMap.h:428
Measure
Definition: IDMapper.h:66
A container for consensus elements.
Definition: ConsensusMap.h:60
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:151
CoordinateType getRT() const
Returns the RT coordinate (index 0)
Definition: Peak2D.h:203
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:451
const std::vector< DataProcessing > & getDataProcessing() const
returns a const reference to the description of the applied data processing
Definition: FeatureMap.h:446
DoubleReal mz_tolerance_
Allowed m/z deviation.
Definition: IDMapper.h:383
const std::vector< ConvexHull2D > & getConvexHulls() const
Non-mutable access to the convex hulls.
bool ignore_charge_
Ignore charge states during matching?
Definition: IDMapper.h:387
Base::iterator Iterator
Definition: FeatureMap.h:154
An LC-MS feature.
Definition: Feature.h:66
Representation of a mass spectrometry experiment.
Definition: MSExperiment.h:68
PositionType const & minPosition() const
Accessor to minimum position.
Definition: DIntervalBase.h:121
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:144
bool empty() const
Definition: MSExperiment.h:127
void setMinX(CoordinateType const c)
Mutator for min_ coordinate of the smaller point.
Definition: DIntervalBase.h:264
Measure measure_
Measure used for m/z.
Definition: IDMapper.h:385
PositionType const & maxPosition() const
Accessor to maximum position.
Definition: DIntervalBase.h:127
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:90
A D-dimensional bounding box.
Definition: DBoundingBox.h:51
DoubleReal rt_tolerance_
Allowed RT deviation.
Definition: IDMapper.h:381
void annotate(MSExperiment< PeakType > &map, const std::vector< PeptideIdentification > &ids, const std::vector< ProteinIdentification > &protein_ids)
Mapping method for peak maps.
Definition: IDMapper.h:90
DoubleReal list.
Definition: DoubleList.h:56
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
Int list.
Definition: IntList.h:56

OpenMS / TOPP release 1.11.1 Documentation generated on Thu Nov 14 2013 11:19:15 using doxygen 1.8.5