OpenMS  3.0.0
IdentificationDataConverter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Hendrik Weisser $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
39 #include <OpenMS/FORMAT/MzTab.h>
42 
43 namespace OpenMS
44 {
45  class FeatureMap;
46 
47  class OPENMS_DLLAPI IdentificationDataConverter
48  {
49  public:
50 
// Import identification results into `id_data` from two read-only vectors:
// `proteins` (ProteinIdentification objects) and `peptides`
// (PeptideIdentification objects). `id_data` is the only argument taken by
// non-const reference, so it is the only one this call can modify.
// NOTE(review): only the declaration is visible in this header; how existing
// content of `id_data` is treated must be checked in the implementation.
52  static void importIDs(IdentificationData& id_data,
53  const std::vector<ProteinIdentification>& proteins,
54  const std::vector<PeptideIdentification>& peptides);
55 
// Export direction: `id_data` is read-only here, while `proteins` and
// `peptides` are non-const references and therefore the outputs.
// `export_ids_wo_scores` defaults to false; judging by its name it decides
// whether IDs that carry no scores are exported too - TODO confirm against
// the implementation.
61  static void exportIDs(const IdentificationData& id_data,
62  std::vector<ProteinIdentification>& proteins,
63  std::vector<PeptideIdentification>& peptides,
64  bool export_ids_wo_scores = false);
65 
// Build an MzTab object from the read-only `id_data` and return it by value.
67  static MzTab exportMzTab(const IdentificationData& id_data);
68 
// Import sequences from FASTA entries (`fasta`, read-only) into `id_data`.
// `decoy_pattern` defaults to the empty string; presumably it is used to
// recognise decoy entries - TODO confirm in the implementation.
// NOTE(review): this listing jumps from source line 71 to 74, so a parameter
// declared between `fasta` and `decoy_pattern` may be missing from this view.
70  static void importSequences(IdentificationData& id_data,
71  const std::vector<FASTAFile::FASTAEntry>& fasta,
74  const String& decoy_pattern = "");
75 
// Export the content of `parent_matches` (read-only) to the PeptideHit `hit`,
// which is passed by non-const reference and is therefore the output.
77  static void exportParentMatches(
78  const IdentificationData::ParentMatches& parent_matches, PeptideHit& hit);
79 
// ID import applied to a whole FeatureMap. `features` is a non-const
// reference, so the map may be modified in place.
// `clear_original` defaults to true; presumably the original IDs are removed
// once converted - TODO confirm in the implementation.
86  static void importFeatureIDs(FeatureMap& features, bool clear_original = true);
87 
// ID export applied to a whole FeatureMap. `features` is a non-const
// reference, so the map may be modified in place.
// `clear_original` defaults to true; presumably the original IDs are removed
// once converted - TODO confirm in the implementation.
94  static void exportFeatureIDs(FeatureMap& features, bool clear_original = true);
95 
96  protected:
97 
// Optional reference to a processing step; an empty optional stands for
// "no associated step".
98  using StepOpt = std::optional<IdentificationData::ProcessingStepRef>;
99 
// "Less than" predicate for StepOpt values: an empty optional sorts before
// any non-empty one; two non-empty values are ordered by operator< of the
// steps they refer to.
// NOTE(review): the line that opens this struct (source line 101,
// StepOptCompare according to the cross-reference index) is absent from this
// listing.
102  {
103  bool operator()(const StepOpt& left, const StepOpt& right) const
104  {
105  // @TODO: should runs without associated step go first or last?
// Empty `left`: it is "less" only if `right` holds a value. Two empty values
// yield false, i.e. they compare as equivalent.
106  if (!left) return bool(right);
// `left` holds a value but `right` does not: `left` is not "less".
107  if (!right) return false;
// Both hold a value: the first `*` unwraps the optional, the second
// dereferences the step reference; the referenced steps are then compared.
108  return **left < **right;
109  }
110  };
111 
// "Less than" predicate for PeptideIdentification objects. The primary key is
// RT and the secondary key is m/z; for either key, an ID that lacks the value
// sorts before an ID that has it.
// NOTE(review): this listing omits the struct declaration (source line 113,
// PepIDCompare per the cross-reference index) and the first line of
// operator() (source line 115, which declares the parameter `left`).
114  {
116  const PeptideIdentification& right) const
117  {
118  // @TODO: should IDs without RT go first or last?
119  if (left.hasRT())
120  {
121  if (right.hasRT())
122  {
// Exact floating-point comparison: only bit-for-bit equal RTs fall through
// to the m/z comparison further down.
123  if (right.getRT() != left.getRT())
124  {
125  return left.getRT() < right.getRT();
126  } // else: compare by m/z (below)
127  }
128  else
129  {
// `left` has an RT, `right` does not: `right` sorts first.
130  return false;
131  }
132  }
133  else if (right.hasRT())
134  {
// `left` has no RT, `right` has one: `left` sorts first.
135  return true;
136  }
137  // no RTs or same RTs -> try to compare by m/z:
138  if (left.hasMZ())
139  {
140  if (right.hasMZ())
141  {
142  return left.getMZ() < right.getMZ();
143  }
144  else
145  {
// `left` has an m/z, `right` does not: `right` sorts first.
146  return false;
147  }
148  }
// `left` has no m/z here, so it is "less" exactly when `right` has one.
149  // if both PI's have nothing, return false (to ensure 'x < x' is false for strict weak ordering)
150  return right.hasMZ();
151  }
152  };
153 
// Append one mzTab row describing `parent` to `output`.
// MzTabSectionRow is the row type; the body requires it to provide the
// members accession, search_engine, best_search_engine_score, description,
// coverage and opt_.
// `score_map` is handed on unchanged to exportStepsAndScoresToMzTab_.
// NOTE(review): source lines 156-157 (function name and the `parent`
// parameter) are absent from this listing; per the cross-reference index the
// signature starts with
// exportParentSequenceToMzTab_(const IdentificationData::ParentSequence& parent, ...).
155  template <typename MzTabSectionRow>
158  std::vector<MzTabSectionRow>& output,
159  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
160  {
161  MzTabSectionRow row;
162  row.accession.set(parent.accession);
// Fills the search-engine and best-score columns of the row.
163  exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
164  row.best_search_engine_score, score_map);
165  row.description.set(parent.description);
166  row.coverage.set(parent.coverage);
// The sequence has no dedicated column in this row; it is stored in an
// optional column named "opt_sequence", and only when it is non-empty.
167  if (!parent.sequence.empty())
168  {
169  MzTabOptionalColumnEntry opt_seq;
170  opt_seq.first = "opt_sequence";
171  opt_seq.second.set(parent.sequence);
172  row.opt_.push_back(opt_seq);
173  }
174  output.push_back(row);
175  }
176 
// Append mzTab rows for one identified sequence (`identified`) to `output`.
// Without parent matches exactly one row is written; otherwise one row is
// written per individual parent match, each repeating the shared columns.
// MzTabSectionRow is the row type; IdentSeq must provide sequence (with
// toString()), steps_and_scores and parent_matches.
// `score_map` is handed on unchanged to exportStepsAndScoresToMzTab_.
// NOTE(review): source line 179 (the function name) is absent from this
// listing; the cross-reference index names it exportPeptideOrOligoToMzTab_.
178  template <typename MzTabSectionRow, typename IdentSeq>
180  const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
181  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
182  {
183  MzTabSectionRow row;
184  // @TODO: handle modifications properly
185  row.sequence.set(identified.sequence.toString());
186  exportStepsAndScoresToMzTab_(identified.steps_and_scores,
187  row.search_engine,
188  row.best_search_engine_score, score_map);
189  if (identified.parent_matches.empty()) // no parent information given
190  {
191  // row.unique.set(false); // leave this unset?
192  output.push_back(row);
193  }
194  else // generate entries (with duplicated data) for every accession
195  {
196  // in mzTab, "unique" means "peptide is unique for this protein"
// Uniqueness is decided by the number of distinct parents (entries of
// parent_matches), not by the number of matches per parent.
197  row.unique.set(identified.parent_matches.size() == 1);
198  for (const auto& match_pair : identified.parent_matches)
199  {
// The accession is set once per parent on the shared `row`, so every copy
// made in the inner loop inherits it.
200  row.accession.set(match_pair.first->accession);
201  for (const IdentificationData::ParentMatch& match :
202  match_pair.second)
203  {
// Work on a copy so that match-specific context does not leak into the
// rows written for later matches.
204  MzTabSectionRow copy = row;
205  addMzTabMoleculeParentContext_(match, copy);
206  output.push_back(copy);
207  }
208  }
209  }
210  }
211 
// Append one mzTab row (PSM or OSM) for the observation match `match` to
// `output`.
//   sequence   - text written to the row's sequence column
//   match      - the match; its observation_ref supplies rt, mz, input_file
//                and data_id
//   calc_mass  - calculated mass; divided by |charge| to obtain the
//                calculated m/z
//   output     - receives the new row
//   score_map  - handed on unchanged to exportStepsAndScoresToMzTab_
//   file_map   - maps an input file reference to the index passed to
//                setMSFile()
// NOTE(review): source line 214 (the function name) is absent from this
// listing; the cross-reference index names it exportObservationMatchToMzTab_.
213  template <typename MzTabSectionRow>
215  const String& sequence,
216  const IdentificationData::ObservationMatch& match, double calc_mass,
217  std::vector<MzTabSectionRow>& output,
218  std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
219  std::map<IdentificationData::InputFileRef, Size>& file_map)
220  {
221  MzTabSectionRow xsm; // PSM or OSM
222  // @TODO: handle modifications properly
223  xsm.sequence.set(sequence);
224  exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
225  xsm.search_engine_score, score_map);
226  const IdentificationData::Observation& query = *match.observation_ref;
// The retention-time column takes a list; a one-element list holding the
// observation's RT is written.
227  std::vector<MzTabDouble> rts(1);
228  rts[0].set(query.rt);
229  xsm.retention_time.set(rts);
230  xsm.charge.set(match.charge);
231  xsm.exp_mass_to_charge.set(query.mz);
// NOTE(review): a charge of 0 makes this a floating-point division by zero
// (non-finite result) - confirm that matches with charge 0 cannot reach this
// point. `abs` is called unqualified on an integer charge.
232  xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
// NOTE(review): std::map::operator[] inserts a value-initialised entry (0)
// when the input file is not yet in `file_map`, silently modifying the
// caller's map - confirm that every file is registered beforehand.
233  xsm.spectra_ref.setMSFile(file_map[query.input_file]);
234  xsm.spectra_ref.setSpecRef(query.data_id);
235  // optional column for adduct:
236  if (match.adduct_opt)
237  {
238  MzTabOptionalColumnEntry opt_adduct;
239  opt_adduct.first = "opt_adduct";
240  opt_adduct.second.set((*match.adduct_opt)->getName());
241  xsm.opt_.push_back(opt_adduct);
242  }
243  // optional columns for isotope offset:
244  // @TODO: find a way of passing in the names of relevant meta values
245  // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
246  if (match.metaValueExists("isotope_offset"))
247  {
248  MzTabOptionalColumnEntry opt_meta;
249  opt_meta.first = "opt_isotope_offset";
250  opt_meta.second.set(match.getMetaValue("isotope_offset"));
251  xsm.opt_.push_back(opt_meta);
252  }
253  // don't repeat data from the peptide section (e.g. accessions)
254  // why are "pre"/"post"/"start"/"end" not in the peptide section?!
255  output.push_back(xsm);
256  }
257 
// Helper used by the templated mzTab export functions in this class, which
// pass a row's search-engine column as `steps_out` and its score column as
// `scores_out`.
// `steps_and_scores` is the read-only input; `steps_out`, `scores_out` and
// `score_map` are non-const references and may all be modified.
// NOTE(review): only the declaration is visible; how `score_map` assigns
// indexes to score types must be checked in the implementation.
259  static void exportStepsAndScoresToMzTab_(
260  const IdentificationData::AppliedProcessingSteps& steps_and_scores,
261  MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
262  std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
263 
// Fill `output` (index -> MzTabParameter) from the read-only `scores` map
// (score type reference -> index).
// NOTE(review): presumably the Size values of `scores` become the keys of
// `output` - TODO confirm in the implementation.
265  static void addMzTabSEScores_(
266  const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
267  std::map<Size, MzTabParameter>& output);
268 
// Add context taken from the read-only parent match `match` to an mzTab row.
// The templated peptide/oligonucleotide export in this class calls this name
// with a row copy as second argument.
// NOTE(review): the listing drops the line(s) holding the second parameter
// (source lines 271-272), so the row type of this overload is not visible
// here - TODO confirm against the original header.
270  static void addMzTabMoleculeParentContext_(
271  const IdentificationData::ParentMatch& match,
273 
// Second overload of addMzTabMoleculeParentContext_: same read-only `match`
// parameter, presumably a different mzTab row type as second parameter.
// NOTE(review): the listing drops the line(s) holding the second parameter
// (source lines 276-277), so the row type of this overload is not visible
// here - TODO confirm against the original header.
275  static void addMzTabMoleculeParentContext_(
276  const IdentificationData::ParentMatch& match,
278 
// Import database-search parameters; returns an
// IdentificationData::SearchParamRef. `id_data` is a non-const reference, so
// it may be modified.
// NOTE(review): source line 281 (the first parameter) is absent from this
// listing - TODO confirm the full signature against the original header.
280  static IdentificationData::SearchParamRef importDBSearchParameters_(
282  IdentificationData& id_data);
283 
// Export database-search parameters; returns a
// ProteinIdentification::SearchParameters object by value.
// NOTE(review): the parameter list (source line 286) is absent from this
// listing - TODO confirm the full signature against the original header.
285  static ProteinIdentification::SearchParameters exportDBSearchParameters_(
287 
// Export MS run information to `protein`, which is passed by non-const
// reference and is therefore the output.
// NOTE(review): source line 290 (the first parameter) is absent from this
// listing - TODO confirm the full signature against the original header.
289  static void exportMSRunInformation_(
291  ProteinIdentification& protein);
292 
// Per-feature helper for the feature ID import. `feature`, `peptides` and
// `id_counter` are non-const references and may be modified; `clear_original`
// has no default here.
// NOTE(review): `indexes` is taken by value, whereas handleFeatureExport_
// takes it by const reference. Unless the callee needs its own mutable copy,
// this copies the list on every call - confirm against the implementation
// before changing, since the definition must match this declaration.
293  static void handleFeatureImport_(Feature& feature, IntList indexes,
294  std::vector<PeptideIdentification>& peptides,
295  Size& id_counter, bool clear_original);
296 
// Per-feature helper for the feature ID export. `indexes` is read-only;
// `feature`, `id_data` and `id_counter` are non-const references and may be
// modified.
297  static void handleFeatureExport_(Feature& feature, const IntList& indexes,
298  IdentificationData& id_data, Size& id_counter);
299  };
300 }
OpenMS::IdentificationDataConverter::StepOpt
std::optional< IdentificationData::ProcessingStepRef > StepOpt
Definition: IdentificationDataConverter.h:98
OpenMS::IdentificationDataInternal::ParentSequence::accession
String accession
Definition: ParentSequence.h:51
OpenMS::IdentificationDataInternal::MoleculeType
MoleculeType
Definition: MetaData.h:65
OpenMS::IdentificationDataInternal::ParentSequence::sequence
String sequence
Definition: ParentSequence.h:57
OpenMS::PeptideIdentification::hasRT
bool hasRT() const
shortcut for !isnan(getRT())
OpenMS::IdentificationData::AppliedProcessingSteps
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition: IdentificationData.h:135
OpenMS::MetaInfoInterface::getMetaValue
const DataValue & getMetaValue(const String &name, const DataValue &default_value=DataValue::EMPTY) const
Returns the value corresponding to a string, or a default value (default: DataValue::EMPTY) if not found.
OpenMS::String
A more convenient string class.
Definition: String.h:58
OpenMS::MzTabPeptideSectionRow
PEP - Peptide section (Table based)
Definition: MzTab.h:242
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::IdentificationDataConverter::exportObservationMatchToMzTab_
static void exportObservationMatchToMzTab_(const String &sequence, const IdentificationData::ObservationMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition: IdentificationDataConverter.h:214
OpenMS::IdentificationDataInternal::PROTEIN
Definition: MetaData.h:67
OpenMS::IdentificationDataInternal::ScoredProcessingResult::steps_and_scores
AppliedProcessingSteps steps_and_scores
Definition: ScoredProcessingResult.h:46
OpenMS::IdentificationDataInternal::Observation::mz
double mz
Definition: Observation.h:61
OpenMS::MzTab
Data model of MzTab files. Please see the official MzTab specification at https://code....
Definition: MzTab.h:477
OpenMS::IdentificationDataInternal::ObservationMatch::observation_ref
ObservationRef observation_ref
Definition: ObservationMatch.h:77
OpenMS::IntList
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:55
OpenMS::ProteinIdentification
Representation of a protein identification run.
Definition: ProteinIdentification.h:70
OpenMS::IdentificationDataInternal::ObservationMatch::charge
Int charge
Definition: ObservationMatch.h:79
OpenMS::IdentificationDataConverter::exportPeptideOrOligoToMzTab_
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
Definition: IdentificationDataConverter.h:179
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
OpenMS::IdentificationDataInternal::Observation::rt
double rt
Definition: Observation.h:61
OpenMS::IdentificationDataInternal::Observation
Representation of an observation, e.g. a spectrum or feature, in an input data file.
Definition: Observation.h:53
FASTAFile.h
OpenMS::MzTabParameterList
Definition: MzTabBase.h:268
OpenMS::IdentificationData::ParentMatches
IdentificationDataInternal::ParentMatches ParentMatches
Definition: IdentificationData.h:146
OpenMS::IdentificationDataInternal::IteratorWrapper
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:45
ProteinIdentification.h
OpenMS::IdentificationDataConverter::StepOptCompare::operator()
bool operator()(const StepOpt &left, const StepOpt &right) const
Definition: IdentificationDataConverter.h:103
OpenMS::PeptideIdentification::hasMZ
bool hasMZ() const
shortcut for !isnan(getMZ())
OpenMS::IdentificationDataConverter::StepOptCompare
Functor for ordering StepOpt (by date of the steps, if available):
Definition: IdentificationDataConverter.h:101
OpenMS::IdentificationDataInternal::ParentSequence
Representation of a parent sequence that is identified only indirectly (e.g. a protein).
Definition: ParentSequence.h:49
MzTab.h
OpenMS::IdentificationDataInternal::ParentSequence::description
String description
Definition: ParentSequence.h:59
OpenMS::IdentificationData
Representation of spectrum identification results and associated data.
Definition: IdentificationData.h:94
OpenMS::FeatureMap
A container for features.
Definition: FeatureMap.h:98
OpenMS::PeptideIdentification
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63
OpenMS::PeptideIdentification::getMZ
double getMZ() const
returns the MZ of the MS2 spectrum
OpenMS::Feature
An LC-MS feature.
Definition: Feature.h:70
OpenMS::MetaInfoInterface::metaValueExists
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
OpenMS::IdentificationDataConverter::exportParentSequenceToMzTab_
static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent sequence (protein or nucleic acid) to mzTab.
Definition: IdentificationDataConverter.h:156
OpenMS::IdentificationDataInternal::ObservationMatch::adduct_opt
AdductOpt adduct_opt
optional reference to adduct
Definition: ObservationMatch.h:81
OpenMS::MzTabOligonucleotideSectionRow
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:396
OpenMS::IdentificationDataInternal::Observation::data_id
String data_id
Spectrum or feature ID (from the file referenced by input_file)
Definition: Observation.h:56
IdentificationData.h
OpenMS::IdentificationDataInternal::ParentMatch
Meta data for the association between an identified molecule (e.g. peptide) and a parent sequence (e.g. protein).
Definition: ParentMatch.h:45
OpenMS::IdentificationDataConverter::PepIDCompare
Functor for ordering peptide IDs by RT and m/z (if available)
Definition: IdentificationDataConverter.h:113
OpenMS::IdentificationDataConverter
Definition: IdentificationDataConverter.h:47
PeptideIdentification.h
OpenMS::IdentificationDataInternal::ObservationMatch
Representation of a search hit (e.g. peptide-spectrum match).
Definition: ObservationMatch.h:73
OpenMS::ProteinIdentification::SearchParameters
Search parameters of the DB search.
Definition: ProteinIdentification.h:258
OpenMS::IdentificationDataInternal::ParentSequence::coverage
double coverage
sequence coverage as a fraction between 0 and 1
Definition: ParentSequence.h:61
OpenMS::IdentificationDataInternal::Observation::input_file
InputFileRef input_file
Reference to the input file.
Definition: Observation.h:59
OpenMS::MzTabOptionalColumnEntry
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition: MzTabBase.h:229
OpenMS::PeptideIdentification::getRT
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
OpenMS::IdentificationDataConverter::PepIDCompare::operator()
bool operator()(const PeptideIdentification &left, const PeptideIdentification &right) const
Definition: IdentificationDataConverter.h:115
OpenMS::PeptideHit
Representation of a peptide hit.
Definition: PeptideHit.h:55