OpenMS  3.0.0
FalseDiscoveryRate.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
42 
43 #include <unordered_map>
44 
45 #include <vector>
46 #include <unordered_set>
47 
48 namespace OpenMS
49 {
50 
// Forward declaration only: the type is used below purely by reference in
// member-function signatures. The cross-reference index of this listing
// places its definition in IDScoreGetterSetter.h (line 55).
51  struct ScoreToTgtDecLabelPairs;
52 
/// Calculates false discovery rates (FDR) from identifications.
///
/// Publicly derives from DefaultParamHandler. Only declarations are visible
/// in this listing; the renderer dropped the original Doxygen comment lines,
/// which is why the embedded listing numbers have gaps.
77  class OPENMS_DLLAPI FalseDiscoveryRate :
78  public DefaultParamHandler
79  {
80 public:
83 
/// Calculates the FDR of two runs, a forward run and a decoy run, on peptide level.
/// Both vectors are taken by non-const reference; the method itself is const.
90  void apply(std::vector<PeptideIdentification>& fwd_ids, std::vector<PeptideIdentification>& rev_ids) const;
91 
/// Single-list overload for peptide identifications.
/// @param id peptide identifications, passed by non-const reference
/// @param annotate_peptide_fdr optional flag, defaults to false
/// NOTE(review): presumably the list mixes target and decoy hits -- confirm against the implementation.
98  void apply(std::vector<PeptideIdentification>& id, bool annotate_peptide_fdr = false) const;
99 
/// Protein-identification counterpart of the two-list overload above
/// (parameter names again distinguish fwd_ids and rev_ids).
106  void apply(std::vector<ProteinIdentification>& fwd_ids, std::vector<ProteinIdentification>& rev_ids) const;
107 
/// Protein-identification overload taking a single list by non-const reference.
113  void apply(std::vector<ProteinIdentification>& ids) const;
114 
/// Takes protein identifications by non-const reference.
/// NOTE(review): the name suggests an estimated (rather than decoy-counted) FDR -- confirm.
120  void applyEstimated(std::vector<ProteinIdentification>& ids) const;
121 
/// Evaluation overloads returning a double. All three share the same defaults:
/// pepCutoff = 1.0, fpCutoff = 50, diffWeight = 0.2.
/// This overload reads a vector of protein identification runs (const reference).
/// NOTE(review): the meaning of the returned value is not visible here -- see the implementation.
131  double applyEvaluateProteinIDs(const std::vector<ProteinIdentification>& ids, double pepCutoff = 1.0, UInt fpCutoff = 50, double diffWeight = 0.2) const;
/// Same as above for a single protein identification run (const reference).
141  double applyEvaluateProteinIDs(const ProteinIdentification& ids, double pepCutoff = 1.0, UInt fpCutoff = 50, double diffWeight = 0.2) const;
142 
/// Same as above, operating on score/label pairs. Note the non-const reference:
/// the argument may be modified by the call.
152  double applyEvaluateProteinIDs(ScoreToTgtDecLabelPairs& score_to_tgt_dec_fraction_pairs, double pepCutoff = 1.0, UInt fpCutoff = 50, double diffWeight = 0.2) const;
153 
/// Simpler reimplementation of the apply function above for PSMs, with charge and
/// identifier information taken from the given runs (run_info).
/// Unlike the apply() overloads, the applyBasic() family is NOT const-qualified.
155  void applyBasic(const std::vector<ProteinIdentification> & run_info, std::vector<PeptideIdentification> & ids);
/// PSM overload with explicit settings.
/// @param ids peptide identifications, passed by non-const reference
/// @param higher_score_better score orientation (no default)
/// @param charge defaults to 0
/// @param identifier defaults to the empty string
/// @param only_best_per_pep defaults to false
/// NOTE(review): presumably charge 0 / empty identifier mean "no restriction" -- confirm.
157  void applyBasic(std::vector<PeptideIdentification> & ids, bool higher_score_better, int charge = 0, String identifier = "", bool only_best_per_pep = false);
/// ConsensusMap overload; use_unassigned_peptides defaults to true.
159  void applyBasic(ConsensusMap & cmap, bool use_unassigned_peptides = true);
/// Single protein identification run overload; groups_too defaults to true.
161  void applyBasic(ProteinIdentification & id, bool groups_too = true);
162 
/// Operates on a single protein identification run (non-const reference, non-const method).
/// @param decoy_string defaults to the empty string
/// @param prefix defaults to true
/// @param groups_too defaults to true
/// NOTE(review): presumably an empty decoy_string triggers auto-detection via
/// DecoyStringHelper below -- confirm against the implementation.
172  void applyPickedProteinFDR(ProteinIdentification& id, String decoy_string = "", bool prefix = true, bool groups_too = true);
173 
/// rocN overloads: each returns a double for the given input and false-positive cutoff.
/// This overload reads peptide identifications.
176  double rocN(const std::vector<PeptideIdentification>& ids, Size fp_cutoff) const;
177 
/// As above, with an additional identifier string.
/// NOTE(review): presumably restricts the computation to one run -- confirm.
180  double rocN(const std::vector<PeptideIdentification>& ids, Size fp_cutoff, const String& identifier) const;
181 
/// ConsensusMap overload; include_unassigned_peptides defaults to false.
184  double rocN(const ConsensusMap& ids, Size fp_cutoff, bool include_unassigned_peptides = false) const;
185 
/// ConsensusMap overload with an additional identifier string;
/// include_unassigned_peptides defaults to false.
188  double rocN(const ConsensusMap& ids, Size fp_cutoff, const String& identifier, bool include_unassigned_peptides = false) const;
189 
190  //TODO the next two methods could potentially be merged for speed (they iterate over the same structure)
191  //But since they have different cutoff types and it is more generic, I leave it like this.
/// Reads score/label pairs (const reference) and returns a double; pepCutoff defaults to 1.0.
193  double diffEstimatedEmpirical(const ScoreToTgtDecLabelPairs& scores_labels, double pepCutoff = 1.0) const;
194 
/// rocN on score/label pairs; note the different parameter spelling (fpCutoff) and its default of 50.
197  double rocN(const ScoreToTgtDecLabelPairs& scores_labels, Size fpCutoff = 50) const;
198 
/// IdentificationData variant: takes the data by non-const reference together with a
/// score type reference, and returns a score type reference.
/// NOTE(review): presumably the returned reference identifies a newly added FDR/q-value score -- confirm.
207  IdentificationData::ScoreTypeRef applyToObservationMatches(IdentificationData& id_data, IdentificationData::ScoreTypeRef score_ref) const;
208 
/// Nested helper DecoyStringHelper: finds decoy strings in ProteinIdentification runs.
/// NOTE(review): the class header itself (listing line 212) was dropped by the renderer;
/// the name is taken from this listing's cross-reference index.
213  {
214  public:
/// Outcome of the decoy string search.
/// NOTE(review): the index also lists a member `String name` ("on success, what was the
/// decoy string?") at listing line 221, which is missing from this rendering.
218  struct Result
219  {
/// did more than 30% of proteins have the *same* prefix or suffix
220  bool success;
/// on success, was it a prefix or suffix
222  bool is_prefix;
223  };
224 
/// Finds the most common decoy string in the accessions of proteins; checks for
/// suffix and prefix. Static: callable without a DecoyStringHelper instance.
231  static Result findDecoyString(const ProteinIdentification& proteins);
232  };
233 private:
234 
237 
/// Copy assignment is declared private (no definition visible here).
/// NOTE(review): presumably to disallow assignment; listing lines 235-238 are partly
/// missing, so a matching private copy constructor may exist -- confirm.
239  FalseDiscoveryRate& operator=(const FalseDiscoveryRate&);
240 
/// Fills score_to_fdr from separate target and decoy score lists.
/// The score vectors are non-const references and may be modified (e.g. reordered) by the call.
/// @param q_value NOTE(review): presumably selects q-values instead of raw FDRs -- confirm
/// @param higher_score_better score orientation
242  void calculateFDRs_(std::map<double, double>& score_to_fdr, std::vector<double>& target_scores, std::vector<double>& decoy_scores, bool q_value, bool higher_score_better) const;
243 
/// Helper working on the target/decoy score lists and the two lookup maps below.
/// NOTE(review): the leading parameters (listing lines 246-247) are missing from this rendering.
245  void handleObservationMatch_(
248  std::vector<double>& target_scores,
249  std::vector<double>& decoy_scores,
250  std::map<IdentificationData::IdentifiedMolecule, bool>& molecule_to_decoy,
251  std::map<IdentificationData::ObservationMatchRef, double>& match_to_score) const;
252 
/// Fills scores_to_FDR from score/label pairs (both passed by non-const reference).
255  void calculateEstimatedQVal_(std::map<double, double> &scores_to_FDR,
256  ScoreToTgtDecLabelPairs &scores_labels,
257  bool higher_score_better) const;
258 
/// Fills scores_to_FDR from score/label pairs; used with the same parameter roles as
/// calculateFDRs_ (qvalue flag, score orientation) but on the combined pair structure.
264  void calculateFDRBasic_(std::map<double,double>& scores_to_FDR, ScoreToTgtDecLabelPairs& scores_labels, bool qvalue, bool higher_score_better) const;
265 
/// Numeric helper on two (exp, act) value pairs.
/// NOTE(review): presumably the trapezoid area relative to the x = y diagonal -- confirm.
268  double trapezoidal_area_xEqy(double exp1, double exp2, double act1, double act2) const;
269 
/// Numeric helper on the points (x1, y1) and (x2, y2).
/// NOTE(review): presumably the area of the trapezoid under the segment between them -- confirm.
271  double trapezoidal_area(double x1, double x2, double y1, double y2) const;
272  };
273 
274 } // namespace OpenMS
DefaultParamHandler.h
OpenMS::TOPPBase
Base class for TOPP applications.
Definition: TOPPBase.h:147
FileHandler.h
FileTypes.h
Size
OpenMS::FalseDiscoveryRate::apply
void apply(std::vector< PeptideIdentification > &fwd_ids, std::vector< PeptideIdentification > &rev_ids) const
Calculates the FDR of two runs, a forward run and a decoy run on peptide level.
OpenMS::IdXMLFile::store
void store(const String &filename, const std::vector< ProteinIdentification > &protein_ids, const std::vector< PeptideIdentification > &peptide_ids, const String &document_id="")
Stores the data in an idXML file.
OpenMS::String
A more convenient string class.
Definition: String.h:58
OpenMS::FalseDiscoveryRate::DecoyStringHelper
Finds decoy strings in ProteinIdentification runs.
Definition: FalseDiscoveryRate.h:212
ConsensusMap.h
OpenMS::IDFilter::updateProteinReferences
static void updateProteinReferences(std::vector< PeptideIdentification > &peptides, const std::vector< ProteinIdentification > &proteins, bool remove_peptides_without_reference=false)
Removes references to missing proteins.
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
IdXMLFile.h
OpenMS::IDFilter::updateProteinGroups
static bool updateProteinGroups(std::vector< ProteinIdentification::ProteinGroup > &groups, const std::vector< ProteinHit > &hits)
Update protein groups after protein hits were filtered.
OPENMS_LOG_WARN
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:460
OpenMS::ProteinIdentification
Representation of a protein identification run.
Definition: ProteinIdentification.h:70
OpenMS::DefaultParamHandler
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
OpenMS::IdXMLFile::load
void load(const String &filename, std::vector< ProteinIdentification > &protein_ids, std::vector< PeptideIdentification > &peptide_ids)
Loads the identifications of an idXML file without identifier.
OPENMS_LOG_FATAL_ERROR
#define OPENMS_LOG_FATAL_ERROR
Macro to be used if fatal errors are reported (processing stops).
Definition: LogStream.h:450
OpenMS::FalseDiscoveryRate::DecoyStringHelper::Result::is_prefix
bool is_prefix
on success, was it a prefix or suffix
Definition: FalseDiscoveryRate.h:222
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
OpenMS::IDFilter::removeUnreferencedProteins
static void removeUnreferencedProteins(ConsensusMap &cmap, bool include_unassigned)
OpenMS::FalseDiscoveryRate::DecoyStringHelper::Result::name
String name
on success, what was the decoy string?
Definition: FalseDiscoveryRate.h:221
OpenMS::IdentificationDataInternal::IteratorWrapper
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:45
OpenMS::FalseDiscoveryRate::applyBasic
void applyBasic(const std::vector< ProteinIdentification > &run_info, std::vector< PeptideIdentification > &ids)
simpler reimplementation of the apply function above for PSMs. With charge and identifier info from r...
ProteinIdentification.h
OpenMS::IDFilter::updateHitRanks
static void updateHitRanks(std::vector< IdentificationType > &ids)
Updates the hit ranks on all peptide or protein IDs.
Definition: IDFilter.h:742
OpenMS::DefaultParamHandler::setParameters
void setParameters(const Param &param)
Sets the parameters.
OpenMS::DefaultParamHandler::getDefaults
const Param & getDefaults() const
Non-mutable access to the default parameters.
OpenMS::IDFilter::countHits
static Size countHits(const std::vector< IdentificationType > &ids)
Returns the total number of peptide/protein hits in a vector of peptide/protein identifications.
Definition: IDFilter.h:614
OpenMS::ConsensusMap
A container for consensus elements.
Definition: ConsensusMap.h:82
OpenMS::UInt
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
OpenMS::FalseDiscoveryRate
Calculates false discovery rates (FDR) from identifications.
Definition: FalseDiscoveryRate.h:77
main
int main(int argc, const char **argv)
Definition: INIFileEditor.cpp:71
OpenMS::IdentificationData
Representation of spectrum identification results and associated data.
Definition: IdentificationData.h:94
OpenMS::FalseDiscoveryRate::DecoyStringHelper::Result
Finds the most common decoy string in the accessions of proteins. Checks for suffix and prefix and so...
Definition: FalseDiscoveryRate.h:218
OpenMS::Param::copy
Param copy(const std::string &prefix, bool remove_prefix=false) const
Returns a new Param object containing all entries that start with prefix.
OpenMS::ScoreToTgtDecLabelPairs
Definition: IDScoreGetterSetter.h:55
OpenMS::IDFilter::removeEmptyIdentifications
static void removeEmptyIdentifications(std::vector< IdentificationType > &ids)
Removes peptide or protein identifications that have no hits in them.
Definition: IDFilter.h:828
OpenMS::Param
Management and storage of parameters / INI files.
Definition: Param.h:69
IdentificationData.h
OpenMS::StringUtils::prefix
static String prefix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:147
IDFilter.h
OpenMS::FalseDiscoveryRate::DecoyStringHelper::Result::success
bool success
did more than 30% of proteins have the *same* prefix or suffix
Definition: FalseDiscoveryRate.h:220
OpenMS::IDFilter::filterHitsByScore
static void filterHitsByScore(std::vector< IdentificationType > &ids, double threshold_score)
Filters peptide or protein identifications according to the score of the hits.
Definition: IDFilter.h:840
OpenMS::Exception::MissingInformation
Not all required information provided.
Definition: Exception.h:186
OPENMS_LOG_INFO
#define OPENMS_LOG_INFO
Macro to be used if information, e.g. a status, should be reported.
Definition: LogStream.h:465
PeptideIdentification.h
FalseDiscoveryRate.h
StandardTypes.h
TOPPBase.h
OpenMS::IdXMLFile
Used to load and store idXML files.
Definition: IdXMLFile.h:68