OpenMS  3.0.0
IDMergerAlgorithm.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Julianus Pfeuffer $
32 // $Authors: Julianus Pfeuffer $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
42 
43 #include <unordered_set>
44 
45 namespace OpenMS
46 {
47 
48  //TODO add params for checking consistency (i.e. how strict to check)
49  //TODO add another subclass that does score-aware merging? (i.e. only keep best per peptide[sequence])
50 
/// Creates a new Protein ID run into which other runs can be inserted.
/// (Brief description as recorded in the cross-reference index of this listing;
/// the rest of the description is truncated there.)
///
/// Derives publicly from DefaultParamHandler and ProgressLogger.
/// NOTE(review): the original doc comments are not part of this listing; the
/// notes below state only what the declarations show and hedge the rest.
61  class OPENMS_DLLAPI IDMergerAlgorithm:
62  public DefaultParamHandler,
63  public ProgressLogger
64  {
65  public:
    /// Constructor; @p runIdentifier defaults to "merged".
    /// NOTE(review): presumably the basis for the identifier of the merged run - confirm in the implementation.
66  explicit IDMergerAlgorithm (const String& runIdentifier = "merged");
67 
    /// Inserts runs, overload taking both vectors by rvalue reference
    /// (the caller's vectors may be moved from).
    /// NOTE(review): presumably adds @p prots and @p peps to the merged result - confirm.
70  void insertRuns(std::vector<ProteinIdentification>&& prots,
71  std::vector<PeptideIdentification>&& peps);
    /// Inserts runs, overload taking both vectors by const reference
    /// (the caller's vectors cannot be modified through these parameters).
72  void insertRuns(const std::vector<ProteinIdentification>& prots,
73  const std::vector<PeptideIdentification>& peps);
74 
75  //TODO add methods to just insert prots or just peps. Especially makes sense if you do re-indexing anyway,
76  // then you do not need the proteins. But then we need origin information. Either externally in form of a
77  // String or StringList (like the one from ProteinID.getPrimaryMSRunPath). Or by having the file annotated
78  // at the PeptideID (with getBasename maybe?)
79  // Current solution would be to clear the ProteinIdentification if you do not need the proteins and add all the
80  // necessary information about origin(s) to this ProteinIdentification.
81 
    /// Takes @p prots and @p peps as non-const references (output parameters).
    /// NOTE(review): going by the name, presumably hands the merged result to the
    /// caller and resets the internal state - confirm in the implementation.
83  void returnResultsAndClear(ProteinIdentification& prots,
84  std::vector<PeptideIdentification>& peps);
85 
86  private:
87 
    /// Const helper returning a String.
    /// NOTE(review): presumably generates the identifier for the merged run - confirm.
89  String getNewIdentifier_() const;
90 
    /// Static helper reading from @p from and writing to @p to.
    /// NOTE(review): presumably copies the search parameters between the two runs - confirm.
92  static void copySearchParams_(const ProteinIdentification& from, ProteinIdentification& to);
93 
    /// Consistency check over @p protRuns for the given @p experiment_type; returns bool.
    /// NOTE(review): the reference run and the meaning of the return value are not
    /// visible in this listing - confirm in the implementation.
99  bool checkOldRunConsistency_(
100  const std::vector<ProteinIdentification>& protRuns,
101  const String& experiment_type) const;
102 
    /// Same as above, but with an explicitly supplied reference run @p ref.
109  bool checkOldRunConsistency_(
110  const std::vector<ProteinIdentification>& protRuns,
111  const ProteinIdentification& ref,
112  const String& experiment_type) const;
113 
    /// Takes the old protein runs by rvalue reference (may be moved from).
    /// NOTE(review): presumably feeds their hits into collected_protein_hits_ - confirm.
116  void insertProteinIDs_(
117  std::vector<ProteinIdentification>&& old_protRuns
118  );
119 
    /// Takes the peptide IDs by rvalue reference (may be moved from), together with
    /// a map from run identifier to run index, the origin files (one StringList per
    /// entry) and a flag @p annotate_origin.
    /// NOTE(review): how the mapping and the flag are applied is not visible here - confirm.
123  void updateAndMovePepIDs_(
124  std::vector<PeptideIdentification>&& pepIDs,
125  const std::map<String, Size>& runID_to_runIdx,
126  const std::vector<StringList>& originFiles,
127  bool annotate_origin
128  );
129 
130 
    /// Takes both the peptide IDs and the old protein runs by rvalue reference.
    /// NOTE(review): presumably a faster path that moves peptide IDs and only the
    /// proteins they reference into the result - inferred from the name, confirm.
131  void movePepIDsAndRefProteinsToResultFaster_(
132  std::vector<PeptideIdentification>&& pepIDs,
133  std::vector<ProteinIdentification>&& old_protRuns
134  );
135 
    // NOTE(review): the cross-reference index of this listing records
    // `ProteinIdentification prot_result_` ("the resulting new Protein IDs") at
    // header line 137; that line is not shown in this listing.
138 
    /// the resulting new Peptide IDs
140  std::vector<PeptideIdentification> pep_result_;
141 
    /// Hash of a protein hit: std::hash<String> applied to its accession only.
142  static size_t accessionHash_(const ProteinHit& p){
143  return std::hash<String>()(p.getAccession());
144  }
    /// Equality of two protein hits: true iff their accessions compare equal.
145  static bool accessionEqual_(const ProteinHit& p1, const ProteinHit& p2){
146  return p1.getAccession() == p2.getAccession();
147  }
    /// Function-pointer types matching the signatures of accessionHash_ / accessionEqual_.
148  using hash_type = std::size_t (*)(const ProteinHit&);
149  using equal_type = bool (*)(const ProteinHit&, const ProteinHit&);
    /// Set of protein hits with function-pointer hasher and equality.
    /// NOTE(review): the pointers are passed where the set is constructed (not visible
    /// here); presumably accessionHash_ and accessionEqual_, which would make the set
    /// unique per accession - confirm in the constructor.
150  std::unordered_set<ProteinHit, hash_type, equal_type> collected_protein_hits_;
151 
    /// Starts out false.
    /// NOTE(review): presumably records whether any run has been inserted yet - confirm.
153  bool filled_ = false;
154 
    /// to keep track of the mzML origins of spectra
156  std::map<String, Size> file_origin_to_idx_;
157 
    // NOTE(review): the cross-reference index of this listing records `String id_`
    // ("the new identifier string") at header line 159; that line is not shown in
    // this listing.
160  };
161 } // namespace OpenMS
DefaultParamHandler.h
OpenMS::SpectrumAlignment::getSpectrumAlignment
void getSpectrumAlignment(std::vector< std::pair< Size, Size > > &alignment, const SpectrumType1 &s1, const SpectrumType2 &s2) const
Definition: SpectrumAlignment.h:88
OpenMS::Math::GaussFitter
Implements a fitter for Gaussian functions.
Definition: GaussFitter.h:60
OpenMS::PeptideHit::getCharge
Int getCharge() const
returns the charge of the peptide
OpenMS::TOPPBase
Base class for TOPP applications.
Definition: TOPPBase.h:147
FileHandler.h
Size
OpenMS::Normalizer
Normalizes the peak intensities spectrum-wise.
Definition: Normalizer.h:57
OpenMS::DPosition::setX
void setX(CoordinateType c)
Name mutator for the first dimension. Only for DPosition<2>, for visualization.
Definition: DPosition.h:171
OpenMS::ProteinIdentification::getIndistinguishableProteins
const std::vector< ProteinGroup > & getIndistinguishableProteins() const
Returns the indistinguishable proteins.
OpenMS::OMSFile
This class supports reading and writing of OMS files.
Definition: OMSFile.h:48
double
OpenMS::Param::setValue
void setValue(const std::string &key, const ParamValue &value, const std::string &description="", const std::vector< std::string > &tags=std::vector< std::string >())
Sets a value.
OpenMS::Math::median
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:138
OpenMS::IdXMLFile::store
void store(const String &filename, const std::vector< ProteinIdentification > &protein_ids, const std::vector< PeptideIdentification > &peptide_ids, const String &document_id="")
Stores the data in an idXML file.
OpenMS::ProteinIdentification::getSignificanceThreshold
double getSignificanceThreshold() const
Returns the protein significance threshold value.
OpenMS::PeptideHit::getSequence
const AASequence & getSequence() const
returns the peptide sequence without leading or trailing spaces
OpenMS::IDMergerAlgorithm::prot_result_
ProteinIdentification prot_result_
the resulting new Protein IDs
Definition: IDMergerAlgorithm.h:137
GaussFitter.h
IDMergerAlgorithm.h
OpenMS::File::basename
static String basename(const String &file)
OpenMS::FileHandler::getTypeByFileName
static FileTypes::Type getTypeByFileName(const String &filename)
Determines the file type from a file name.
OpenMS::IDMergerAlgorithm::pep_result_
std::vector< PeptideIdentification > pep_result_
the resulting new Peptide IDs
Definition: IDMergerAlgorithm.h:140
OpenMS::Math::GaussFitter::GaussFitResult
struct of parameters of a Gaussian distribution
Definition: GaussFitter.h:65
OpenMS::MzMLFile
File adapter for MzML files.
Definition: MzMLFile.h:57
OpenMS::String
A more convenient string class.
Definition: String.h:58
OpenMS::Math::GaussFitter::fit
GaussFitResult fit(std::vector< DPosition< 2 > > &points) const
Fits a Gaussian distribution to the given data points.
OpenMS::IDMergerAlgorithm::file_origin_to_idx_
std::map< String, Size > file_origin_to_idx_
to keep track of the mzML origins of spectra
Definition: IDMergerAlgorithm.h:156
MzMLFile.h
Int
ConsensusMap.h
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::AASequence::getMonoWeight
double getMonoWeight(Residue::ResidueType type=Residue::Full, Int charge=0) const
IdXMLFile.h
OpenMS::ProteinHit
Representation of a protein hit.
Definition: ProteinHit.h:58
OpenMS::Residue::Full
with N-terminus and C-terminus
Definition: Residue.h:153
OpenMS::ProteinIdentification::findHit
std::vector< ProteinHit >::iterator findHit(const String &accession)
Finds a protein hit by accession (returns past-the-end iterator if not found)
StatisticFunctions.h
OpenMS::AASequence::toString
String toString() const
returns the peptide as string with modifications embedded in brackets
OpenMS::Normalizer::filterSpectrum
void filterSpectrum(SpectrumType &spectrum) const
Workhorse of this class.
Definition: Normalizer.h:86
OpenMS::FileTypes::UNKNOWN
Unknown file extension.
Definition: FileTypes.h:58
OpenMS::ProteinIdentification
Representation of a protein identification run.
Definition: ProteinIdentification.h:70
Constants.h
OpenMS::IDMergerAlgorithm
Creates a new Protein ID run into which other runs can be inserted. Creates union of protein hits but...
Definition: IDMergerAlgorithm.h:61
OpenMS::IDMapper::annotate
void annotate(PeakMap &map, const std::vector< PeptideIdentification > &peptide_ids, const std::vector< ProteinIdentification > &protein_ids, const bool clear_ids=false, const bool map_ms1=false)
Mapping method for peak maps.
OpenMS::DefaultParamHandler
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
OpenMS::IdXMLFile::load
void load(const String &filename, std::vector< ProteinIdentification > &protein_ids, std::vector< PeptideIdentification > &peptide_ids)
Loads the identifications of an idXML file without identifier.
OpenMS::Exception::InvalidParameter
Exception indicating that an invalid parameter was handed over to an algorithm.
Definition: Exception.h:339
OpenMS::SpectrumAlignment
Aligns the peaks of two sorted spectra Method 1: Using a banded (width via 'tolerance' parameter) ali...
Definition: SpectrumAlignment.h:67
OpenMS::IDMergerAlgorithm::accessionHash_
static size_t accessionHash_(const ProteinHit &p)
Definition: IDMergerAlgorithm.h:142
OpenMS::ProteinHit::getAccession
const String & getAccession() const
returns the accession of the protein
OMSFile.h
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
OpenMS::MSExperiment::Iterator
std::vector< SpectrumType >::iterator Iterator
Mutable iterator.
Definition: MSExperiment.h:104
OpenMS::Math::Histogram
Representation of a histogram.
Definition: Histogram.h:63
Exception.h
OpenMS::IdentificationData::merge
RefTranslator merge(const IdentificationData &other)
Merge in data from another instance.
OpenMS::StringUtils::remove
static String & remove(String &this_s, char what)
Definition: StringUtilsSimple.h:595
OpenMS::ProgressLogger
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:52
OpenMS::MetaInfoInterface::setMetaValue
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
OpenMS::DataValue
Class to hold strings, numeric values, lists of strings and lists of numeric values.
Definition: DataValue.h:58
ProgressLogger.h
OpenMS::MzMLFile::load
void load(const String &filename, PeakMap &map)
Loads a map from a MzML file. Spectra and chromatograms are sorted by default (this can be disabled u...
OpenMS::ProteinIdentification::getHits
const std::vector< ProteinHit > & getHits() const
Returns the protein hits.
OpenMS::IDMergerAlgorithm::hash_type
std::size_t(*)(const ProteinHit &) hash_type
Definition: IDMergerAlgorithm.h:148
ProteinIdentification.h
OpenMS::FileTypes::Type
Type
Actual file types enum.
Definition: FileTypes.h:56
OPENMS_LOG_DEBUG
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:470
OpenMS::FileHandler::getType
static FileTypes::Type getType(const String &filename)
Tries to determine the file type (by name or content)
OpenMS::IDMergerAlgorithm::collected_protein_hits_
std::unordered_set< ProteinHit, hash_type, equal_type > collected_protein_hits_
Definition: IDMergerAlgorithm.h:150
OpenMS::OMSFile::store
void store(const String &filename, const IdentificationData &id_data)
Write out an IdentificationData object to an SQL-based OMS file.
OpenMS::DefaultParamHandler::setParameters
void setParameters(const Param &param)
Sets the parameters.
Normalizer.h
OpenMS::DPosition::setY
void setY(CoordinateType c)
Name mutator for the second dimension. Only for DPosition<2>, for visualization.
Definition: DPosition.h:178
OpenMS::TheoreticalSpectrumGenerator::getSpectrum
virtual void getSpectrum(PeakSpectrum &spec, const AASequence &peptide, Int min_charge, Int max_charge, Int precursor_charge=0) const
OpenMS::DefaultParamHandler::getParameters
const Param & getParameters() const
Non-mutable access to the parameters.
OpenMS::PeptideHit::extractProteinAccessionsSet
std::set< String > extractProteinAccessionsSet() const
extracts the set of non-empty protein accessions from peptide evidences
OpenMS::IDMergerAlgorithm::id_
String id_
the new identifier string
Definition: IDMergerAlgorithm.h:159
OpenMS::StringList
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
OpenMS::DPosition< 2 >
OpenMS::UInt
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
IDMapper.h
ExperimentalDesign.h
main
int main(int argc, const char **argv)
Definition: INIFileEditor.cpp:71
OpenMS::Math::GaussFitter::setInitialParameters
void setInitialParameters(const GaussFitResult &result)
sets the initial parameters used by the fit method as initial guess for the Gaussian
MSExperiment.h
Histogram.h
OpenMS::TheoreticalSpectrumGenerator
Generates theoretical spectra for peptides with various options.
Definition: TheoreticalSpectrumGenerator.h:68
OpenMS::IdentificationData
Representation of spectrum identification results and associated data.
Definition: IdentificationData.h:94
OpenMS::PeptideIdentification
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
OpenMS::Math::mean
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:120
OpenMS::IDMergerAlgorithm::equal_type
bool(*)(const ProteinHit &, const ProteinHit &) equal_type
Definition: IDMergerAlgorithm.h:149
OpenMS::FileTypes::nameToType
static Type nameToType(const String &name)
OpenMS::Param
Management and storage of parameters / INI files.
Definition: Param.h:69
OpenMS::ProteinIdentification::getScoreType
const String & getScoreType() const
Returns the protein score type.
TheoreticalSpectrumGenerator.h
OpenMS::ProteinIdentification::isHigherScoreBetter
bool isHigherScoreBetter() const
Returns true if a higher score represents a better score.
SpectrumAlignment.h
OpenMS::IDMapper
Annotates an MSExperiment, FeatureMap or ConsensusMap with peptide identifications.
Definition: IDMapper.h:64
OpenMS::MSSpectrum
The representation of a 1D spectrum.
Definition: MSSpectrum.h:66
OpenMS::ProteinIdentification::getProteinGroups
const std::vector< ProteinGroup > & getProteinGroups() const
Returns the protein groups.
OpenMS::Exception::UnableToFit
Exception used if an error occurred while fitting a model to a given dataset.
Definition: Exception.h:688
StandardTypes.h
File.h
OpenMS::OMSFile::load
void load(const String &filename, IdentificationData &id_data)
Read in an OMS file and construct an IdentificationData object.
TOPPBase.h
OpenMS::Math::absdev
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:336
MSSpectrum.h
OpenMS::IDMergerAlgorithm::accessionEqual_
static bool accessionEqual_(const ProteinHit &p1, const ProteinHit &p2)
Definition: IDMergerAlgorithm.h:145
OpenMS::IdXMLFile
Used to load and store idXML files.
Definition: IdXMLFile.h:68
OpenMS::FileTypes::OMS
OpenMS database file.
Definition: FileTypes.h:114
OpenMS::Math::sd
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:321
OpenMS::PeptideHit
Representation of a peptide hit.
Definition: PeptideHit.h:55