From 366410790ad8c402b756e245530a54a89ec7fbc2 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 14:32:47 +0900 Subject: [PATCH 1/9] Update set mzQC for validator --- docs/pages/examples.md | 4 +- .../{set-of-runs.mzQC.md => intro_set.md} | 6 +- .../{set-of-runs.mzQC => intro_set.mzQC} | 158 ++++++++++-------- 3 files changed, 92 insertions(+), 76 deletions(-) rename docs/pages/worked-examples/{set-of-runs.mzQC.md => intro_set.md} (99%) rename specification_documents/examples/{set-of-runs.mzQC => intro_set.mzQC} (61%) diff --git a/docs/pages/examples.md b/docs/pages/examples.md index bd09aeca..6883446c 100644 --- a/docs/pages/examples.md +++ b/docs/pages/examples.md @@ -6,8 +6,8 @@ permalink: /examples/ Here are a number of worked examples, that, each for its own use-case, go step-by-step through the different parts of a mzQC. -- [Single mass spectrometry run](intro_run/) -- [Sets of runs](set-of-runs/) +- [Representing QC data for an individual mass spectrometry run](intro_run/) +- [Deriving QC data from multiple related mass spectrometry runs](intro_set/) - [QC sample mzQC](QC2-sample-example/) - [in mzML](mzml-mzqc-example/) - [Using USI with mzQC](USI-example/) diff --git a/docs/pages/worked-examples/set-of-runs.mzQC.md b/docs/pages/worked-examples/intro_set.md similarity index 99% rename from docs/pages/worked-examples/set-of-runs.mzQC.md rename to docs/pages/worked-examples/intro_set.md index dfc12f61..88ec2351 100644 --- a/docs/pages/worked-examples/set-of-runs.mzQC.md +++ b/docs/pages/worked-examples/intro_set.md @@ -1,7 +1,7 @@ --- layout: page -title: "Multi-Run (i.e. sets) Example of mzQC" -permalink: /examples/set-of-runs/ +title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" +permalink: /examples/intro_set/ --- Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. 
@@ -456,4 +456,4 @@ On the other hand, ommitting the `healthy`/`diseased` setQualities is not sensib } ``` ### This is the mzQC file once again, in full: -**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** \ No newline at end of file +**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** diff --git a/specification_documents/examples/set-of-runs.mzQC b/specification_documents/examples/intro_set.mzQC similarity index 61% rename from specification_documents/examples/set-of-runs.mzQC rename to specification_documents/examples/intro_set.mzQC index 003ece56..4903d5ae 100644 --- a/specification_documents/examples/set-of-runs.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -4,15 +4,15 @@ "creationDate": "2020-12-01T14:19:09Z", "contactName": "Chris Bielow", "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", + "description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", "setQualities": [ { "metadata": { "label": "healthy", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -26,8 +26,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -41,8 +41,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -60,23 +60,29 
@@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" }, { "accession": "MS:1000799", "name": "custom unreleased software tool", - "value": "mzqc-pylib", + "description": "A software tool that has not yet been released. The value should describe the software. Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", "version": "0", - "uri": "https://hupo-psi.github.io/mzQC/unknown.html" + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.25" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -85,8 +91,8 @@ "label": "diseased", "inputFiles": [ { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -100,8 +106,8 @@ ] }, { - "name": "tr2_diseased", - "location": "file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -115,8 +121,8 @@ ] }, { - "name": "tr3_diseased", - "location": "file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -134,16 +140,29 @@ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality 
estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000270", + "accession": "MS:4000XXX", "name": "protein contaminant intensity ratio", - "value": "0.31" + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.31, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } } ] }, @@ -152,8 +171,8 @@ "label": "all", "inputFiles": [ { - "name": "tr1_healthy", - "location": "file:///C:/msdata/techRep1_healthy.mzML", + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -167,8 +186,8 @@ ] }, { - "name": "tr2_healthy", - "location": "file:///C:/msdata/techRep2_healthy.mzML", + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -182,8 +201,8 @@ ] }, { - "name": "tr3_healthy", - "location": "file:///C:/msdata/techRep3_healthy.mzML", + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -197,8 +216,8 @@ ] }, { - "name": "tr1_diseased", - "location": "file:///C:/msdata/techRep1_diseased.mzML", + "name": "techRep1_diseased", + "location": "file://C:/msdata/techRep1_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -212,8 +231,8 @@ ] }, { - "name": "tr2_diseased", - "location": 
"file:///C:/msdata/techRep2_diseased.mzML", + "name": "techRep2_diseased", + "location": "file://C:/msdata/techRep2_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -227,8 +246,8 @@ ] }, { - "name": "tr3_diseased", - "location": "file:///C:/msdata/techRep3_diseased.mzML", + "name": "techRep3_diseased", + "location": "file://C:/msdata/techRep3_diseased.mzML", "fileFormat": { "accession": "MS:1000584", "name": "mzML format" @@ -240,65 +259,67 @@ "value": "2012-02-03 15:00:41" } ] + }, + { + "name": "proteinGroups", + "location": "file://C:/msdata/proteinGroups.txt", + "fileFormat": { + "accession": "MS:1002130", + "name": "identification file format" + }, + "fileProperties": [ + { + "accession": "MS:1000747", + "name": "completion time", + "value": "2012-02-03 18:00:41" + } + ] } ], "analysisSoftware": [ { "accession": "MS:1001058", "name": "quality estimation by manual validation", + "description": "The quality estimation was done manually.", "version": "0", "uri": "https://dx.doi.org/10.1021/pr201071t" + }, + { + "accession": "MS:1000799", + "name": "custom unreleased software tool", + "description": "A software tool that has not yet been released. The value should describe the software. 
Please do not use this term for publicly available software - contact the PSI-MS working group in order to have another CV term added.", + "version": "0", + "uri": "https://hupo-psi.github.io/mzQC/" } ] }, "qualityMetrics": [ { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": [ - "tr1_healthy", - "tr2_healthy", - "tr3_healthy", - "tr1_diseased", - "tr2_diseased", - "tr3_diseased" - ], - "group-label": [ - "healthy", - "healthy", - "healthy", - "diseased", - "diseased", - "diseased" - ] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", + "accession": "MS:4000091", + "name": "principal component analysis of MaxQuant's protein group lfq intensities", + "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", "value": { - "group-label": [ + "MS:4000086": [ "healthy", "diseased" ], - "PCA Dimension 1": [ - 47.22, - -30.22 + "MS:4000081": [ + 47.2, + -30.2 ], - "PCA Dimension 2": [ + "MS:4000082": [ 29.1, -36.5 ], - "PCA Dimension 3": [ + "MS:4000083": [ 3.8, -7.3 ], - "PCA Dimension 4": [ + "MS:4000084": [ -7.7, - 5.55 + 5.6 ], - "PCA Dimension 5": [ + "MS:4000085": [ 140.6, -64.1 ] @@ -308,15 +329,10 @@ } ], "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/mzQC/blob/main/cv/qc-cv.obo", - "version": "1.0.0" - }, { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.71/psi-ms.obo", - "version": "4.1.71" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" } ] } From 49f4133fe50d02d7d0f9d4387a41aa1a367e4a8b Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 19 Mar 2024 15:30:45 +0900 Subject: [PATCH 2/9] Update set description --- .../{MultiSet_PCA.png => intro_set_pca.png} | Bin docs/pages/worked-examples/intro_run.md | 10 +- 
docs/pages/worked-examples/intro_set.md | 535 ++++-------------- .../examples/intro_set.mzQC | 6 +- 4 files changed, 134 insertions(+), 417 deletions(-) rename docs/pages/figures/{MultiSet_PCA.png => intro_set_pca.png} (100%) diff --git a/docs/pages/figures/MultiSet_PCA.png b/docs/pages/figures/intro_set_pca.png similarity index 100% rename from docs/pages/figures/MultiSet_PCA.png rename to docs/pages/figures/intro_set_pca.png diff --git a/docs/pages/worked-examples/intro_run.md b/docs/pages/worked-examples/intro_run.md index cfc92104..76c19d93 100644 --- a/docs/pages/worked-examples/intro_run.md +++ b/docs/pages/worked-examples/intro_run.md @@ -12,6 +12,7 @@ Here, we'll walk through the key components of an mzQC file, which uses a JSON-b You can explore the complete mzQC file [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_run.mzQC), to see all of the elements in their context. An mzQC file starts with the root element `mzQC`: + ``` { "mzQC": { @@ -23,6 +24,7 @@ An mzQC file starts with the root element `mzQC`: Within `mzQC`, there are three main sections: 1. **General file information:** These attributes provide essential details about the mzQC file itself. + ``` "version": "1.0.0", "creationDate": "2020-12-01T11:56:34Z", @@ -33,6 +35,7 @@ Within `mzQC`, there are three main sections: 2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. It is typically included at the end of the mzQC file. + ``` "controlledVocabularies": [ { @@ -44,6 +47,7 @@ It is typically included at the end of the mzQC file. ``` 3. **Quality metrics for the run:** This core part of the file captures the QC metrics specific to the run being described. + ``` "runQualities": [ { @@ -55,6 +59,7 @@ It is typically included at the end of the mzQC file. 
In the `runQualities` section, you may find multiple `runQuality` elements, each corresponding to a unique mass spectrometry run. For simplicity, this example only includes a single run in the mzQC file. First, this includes a `metadata` part detailing the run specifics, such as the source files and software used in analysis: + ``` "metadata": { "inputFiles": [ @@ -67,6 +72,7 @@ First, this includes a `metadata` part detailing the run specifics, such as the ``` Digging a bit deeper, for example, the `inputFiles` array describes each file contributing to the run, including details like file name, location (URI), format, and properties—all standardized using CV terms. + ``` "inputFiles": [ { @@ -101,6 +107,7 @@ Finally, the `qualityMetrics` array lists the metrics derived from the run, each Metrics can take various forms, such as single values, tuples (arrays of values), or more complex structures like matrices or tables, depending on the information being conveyed. For example, a single valued metric: + ``` { "accession": "MS:4000059", @@ -111,10 +118,11 @@ For example, a single valued metric: "accession": "UO:0000189", "name": "count unit" } -} +}, ``` And a tuple metric: + ``` { "accession": "MS:4000069", diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 88ec2351..b9be2d4e 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,456 +4,165 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -Here, we describe an mzQC JSON document used to convey QC data which is computed on a set of runs, i.e. -is **only interpretable in the context of this set** (group). -Of course, QC metrics which refer to each run individually can also be stored, also in the same mzQC file -(see our example `individual-runs.mzQC.md` on how to do that), but this example is about group/set metrics. 
+This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. +Think of a "set" as a bundle of experiments that you want to examine collectively. -Find the complete example file at the bottom of this document or in the example folder. +> [!TIP] +> Sets are versatile! +> You can group runs together, but you can also group sets within other sets. +> This allows for a structured hierarchy in your analysis, like grouping technical replicates under biological ones and then comparing across conditions. + +Discover the full example of an mzQC file for a set [here](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/examples/intro_set.mzQC). + +The structure of an mzQC file for a set mirrors that for a single run, starting with the root element `mzQC`: -The basic structure of our mzQC file is identical to the `individual-runs.mzQC` example, i.e. -the documents main anchor is between the outer curly brackets: ``` -{ "mzQC": - { +{ + "mzQC": { ... 
} } ``` -Within this main anchor, there are usually the following sections: -a) general information about the file, -``` - "version": "1.0.0", - "creationDate": "2020-12-21T11:56:34", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", -``` +Within `mzQC`, there are three main sections: -b) reference information for controlled vocabularies (cv) at the bottom, -``` - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": "https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } - ] -``` -and (now in addition or as replacement) to the `runQualities` of the `individual-runs.mzQC` we have -c) information about the QC metrics computed on **a set of runs**. -``` - "setQualities": [ - { - ... - } - ] -``` -In fact, `setQualities` can contain one or more `setQuality` objects, each defining a different set of runs. -E.g. if you have three technical replicates for two conditions for at total of six runs, you might want to subsume three runs into a set, one for each condition and report the total number of proteins you identified, or the percentage of total intensity attributable to contaminants). Each `setQuality` object is an element of a JSON array, thus it is not explicitly named (i.e. there is no "setQuality" key in the mzQC file). -For the purpose of this example, we will use **three** `setQuality` objects (there could be none, only one or more than two though): +1. **General file information:** These attributes provide essential details about the mzQC file itself. 
``` - the **healthy** set: tr1_healthy, tr2_healthy, tr3_healthy - the **diseased** set: tr1_diseased, tr2_diseased, tr3_diseased - the **all** set: tr1_healthy, tr2_healthy, tr3_healthy, tr1_diseased, tr2_diseased, tr3_diseased +"version": "1.0.0", +"creationDate": "2020-12-01T14:19:09Z", +"contactName": "Chris Bielow", +"contactAddress": "chris.bielow@bsc.fu-berlin.de", +"description": "A simple mzQC file containing information for a set of multiple mass spectrometry runs.", ``` -How you define (and name) each set, is up to you and depends on your experimental design and the kind of comparisons you want to make. +2. **Controlled vocabulary (CV) references:** This section points to standardized vocabularies used to ensure consistent metric definitions across files. +It is typically included at the end of the mzQC file. -A `setQuality` represents QC data that must be viewed in the context of all the runs of this set/group. I.e. the data is only valid within the context of the runs it comprises. E.g. it would be invalid to define a set of three runs and report their individual MS1 scan counts as a 3-tuple -- because this information can clearly be attributed to individual runs and thus belongs in three separate `runQuality` objects, rather than a single `setQuality`. -Similar to `runQuality`, a `setQuality` also contains `metadata` about the set of runs (its input file**s**, the software used, etc). -You can give the set a unique name using the `label` attribute. Here is how a `setQuality` object looks like: -``` - { - "metadata": { - "label": "healthy" - "inputFiles": - ... - }, - "qualityMetrics": [ - ... - ] - } -``` -The `inputFiles` consist of an array of `inputFile` objects, describing the source files with structured information about the file's name, format, location and other properties, defined via cv terms. ``` - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\msdata\techRep1_healthy.mzML", - ... 
- }, - { - "name": "tr2_healthy", - "location": "c:\msdata\techRep2_healthy.mzML", - ... - }, - { - "name": "tr3_healthy", - "location": "c:\msdata\techRep3_healthy.mzML", - ... - } - ] +"controlledVocabularies": [ + { + "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", + "version": "4.1.130" + } +] ``` -The `inputFile` object is only sketched here. It can contain a lot more information, such as file format and further properties. See the full example below or `individual-runs.mzQC` for details. -In `qualityMetrics`, we will store the actual QC information for a particular `setQuality`. Each `qualityMetric` has an `accession` and the corresponding `name` as defined by the QC controlled vocabulary (see `qc-cv.obo`). They should be represented exactly as stated in the .obo file. The `value` carries the actual information and can be either a single value, a tuple of values, a matrix or table. Below, we will look at single values and tables. +3. **Quality metrics for the set:** This core part of the file captures the QC metrics specific to the set being described. -Lets start with our first metric `Protein contaminant intensity ratio`. It describes the relative intensity (in [0, 1]) of all contaminant proteins (from all runs in the set) -- the higher the value the more contaminants are present in the runs of the set. ``` - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": 0.25 +"setQualities": [ + { + ... + } +] ``` -We compute this metric for each set, in our case for the `healthy` as well as the `diseased` set, but not for the `all` set (because we want to keep the example small). But in general, what metrics you compute is up to you. +Each element within `setQualities` defines a distinct set, enabling the comparison of, say, different experimental conditions or replicate groups. 
+ +A set's QC data is contextual—it makes sense within the bounds of the group. +For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. + +Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. +You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. +As an example, we'll use three different groupings: +1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". +2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". +3. The "all" set, combining both the "healthy" and "diseased" set. -Our second example is a principal component analysis (PCA) result matrix. -The `setQuality` where this PCA metric will be stored, references **all** runs as input files. -The input table for a PCA computation can be found, for example, in MaxQuant's proteinGroups.txt output file. To stick with this example, the table in proteinGroups.txt has rows (proteins) and columns (groups, e.g. `healthy` or `diseased`), and the values in the table are protein abundances. Thus, MaxQuant has already aggregated the data from rawfiles(=runs) belonging to a certain group for us (e.g. by averaging the protein abundances). Now your QC software can derive a new table using PCA, where each group is represented by PCA coordinates. +These labels are important, acting as descriptive tags for each set, guiding your analysis. +Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. 
-First, let's see what the PCA plot would look like: -![ Typically, the first two PCA dimensions are plotted, as shown here: Each data point in the plot represents one set(group), e.g. `diseased` or `healthy`.](../../pages/figures/MultiSet_PCA.png) -Now, let's look at the mzQC data which allows to create this plot: We use two separate metrics. One named `group of runs` to associate runs to groups, and secondly a `PCA table` metric to store the PCA data (the first 5 principal components for each group). ``` - "setQualities": [ - ..., - { - ..., - - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - } - ] - +"metadata": { + "label": "healthy", + "inputFiles": [ + ... + ] +}, +"qualityMetrics": [ + ... ] ``` -Note: the `group of runs` metric can be defined only once per `setQuality`, but can be referenced in many metrics (here, for our `PCA table`) in that context. +`inputFiles` lists the specific files contributing to a set, with all the technical details neatly described using CV terms. -If you look closely, we somewhat defined the group `healthy` twice. Once as an individual `setQuality` and once via the `group of runs` qualityMetric in the `all` set. -There is no easy way around this. If we were to omit the `all` set, we'd need to distribute the columns of the PCA table metric into separate `setQuality` objects (and whoever wants to plot it, needs to puzzle it back together; not ideal). 
-On the other hand, ommitting the `healthy`/`diseased` setQualities is not sensible either, because then there would be only the `all` setQuality where all data for different subsets would need to reside. +``` +"inputFiles": [ + { + "name": "techRep1_healthy", + "location": "file://C:/msdata/techRep1_healthy.mzML", + ... + }, + { + "name": "techRep2_healthy", + "location": "file://C:/msdata/techRep2_healthy.mzML", + ... + }, + { + "name": "techRep3_healthy", + "location": "file://C:/msdata/techRep3_healthy.mzML", + ... + } +], +``` +Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: +``` +{ + "accession": "MS:4000XXX", + "name": "protein contaminant intensity ratio", + "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "value": 0.25, + "unit": { + "accession": "UO:0000190", + "name": "ratio" + } +} +``` + +For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +mzQC can store PCA results, capturing the variation between these two states. + +First, let's have a look at what the PCA plot would look like, plotting the first two principal components: +![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) +Next, we'll look at how mzQC can encapsulate such analysis, storing the first five principal components as a table metric, referenced by the previously defined set labels. 
-### This is the mzQC file once again, in full: ``` { - "mzQC": { - "version": "1.0.0", - "creationDate": "2020-12-01T14:19:09", - "contactName": "Chris Bielow", - "contactAddress": "chris.bielow@bsc.fu-berlin.de", - "description": "A simple mzQC file containing information for sets of runs.", - "setQualities": [ - { - "metadata": { - "label": "healthy", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.25" - } - ] - }, - - { - "metadata": { - "label": "diseased", - "inputFiles": [ - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - 
"fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:0000000", - "name": "Protein contaminant intensity ratio", - "value": "0.31" - } - ] - }, - - { - "metadata": { - "label": "all", - "inputFiles": [ - { - "name": "tr1_healthy", - "location": "c:\\msdata\\techRep1_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 11:00:41" - } - ] - }, - { - "name": "tr2_healthy", - "location": "c:\\msdata\\techRep2_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 13:00:41" - } - ] - }, - { - "name": "tr3_healthy", - "location": "c:\\msdata\\techRep3_healthy.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr1_diseased", - "location": "c:\\msdata\\techRep1_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": 
"2012-02-03 12:00:41" - } - ] - }, - { - "name": "tr2_diseased", - "location": "c:\\msdata\\techRep2_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 14:00:41" - } - ] - }, - { - "name": "tr3_diseased", - "location": "c:\\msdata\\techRep3_diseased.mzML", - "fileFormat": { - "accession": "MS:1000584", - "name": "mzML format" - }, - "fileProperties": [ - { - "accession": "MS:1000747", - "name": "completion time", - "value": "2012-02-03 15:00:41" - } - ] - } - ], - "analysisSoftware": [ - { - "accession": "MS:1001058", - "name": "quality estimation by manual validation", - "version": "0", - "uri": "https://dx.doi.org/10.1021/pr201071t" - } - ] - }, - "qualityMetrics": [ - { - "accession": "QC:4000264", - "name": "group of runs", - "value": { - "inputfile_name": ["tr1_healthy", "tr2_healthy", "tr3_healthy" , "tr1_diseased", "tr2_diseased", "tr3_diseased"], - "group-label": ["healthy" , "healthy" , "healthy" , "diseased" , "diseased" , "diseased"] - } - }, - { - "accession": "QC:4000267", - "name": "PCA table", - "value": { - "group-label": ["healthy", "diseased"], - "PCA Dimension 1": [47.22, -30.22], - "PCA Dimension 2": [29.1, -36.5], - "PCA Dimension 3": [3.8, -7.3], - "PCA Dimension 4": [-7.7, 5.55], - "PCA Dimension 5": [140.6, -64.1] - } - } - ] - } - + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", + "value": { + "MS:4000086": [ + "healthy", + "diseased" + ], + "MS:4000081": [ + 47.2, + -30.2 + ], + "MS:4000082": [ + 29.1, + -36.5 + ], + "MS:4000083": [ + 3.8, + -7.3 + ], + "MS:4000084": [ + -7.7, + 5.6 ], - "controlledVocabularies": [ - { - "name": "Proteomics Standards Initiative Quality Control Ontology", - "uri": 
"https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo", - "version": "0.1.0" - }, - { - "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo", - "version": "4.1.7" - } + "MS:4000085": [ + 140.6, + -64.1 ] } } ``` -### This is the mzQC file once again, in full: -**[sets-of-runs.mzQC](https://github.com/HUPO-PSI/mzQC/tree/main/specification_documents/draft_v1/examples/set-of-runs.mzQC)** diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 4903d5ae..799a7039 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -295,9 +295,9 @@ }, "qualityMetrics": [ { - "accession": "MS:4000091", - "name": "principal component analysis of MaxQuant's protein group lfq intensities", - "description": "A table with the PCA results of MaxQuant's protein group lfq intensities.", + "accession": "MS:4000090", + "name": "principal component analysis of MaxQuant's protein group raw intensities", + "description": "A table with the PCA results of MaxQuant's protein group raw intensities.", "value": { "MS:4000086": [ "healthy", From 509f9c0261347b0204b830bfe8e192c5f7e8c0dc Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 20 Mar 2024 09:33:00 +0900 Subject: [PATCH 3/9] Add temporary accession number --- docs/pages/worked-examples/intro_set.md | 12 +++++------ .../examples/intro_set.mzQC | 20 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index b9be2d4e..b117a61e 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -44,8 +44,8 @@ It is typically included at the end of the mzQC file. 
"controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] ``` @@ -113,13 +113,13 @@ Let's dive into an example metric, like the "protein contaminant intensity ratio ``` { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ``` diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 799a7039..90920a98 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -75,13 +75,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -155,13 +155,13 @@ }, "qualityMetrics": [ { - "accession": "MS:4000XXX", + "accession": "MS:4000177", "name": "protein contaminant intensity ratio", - "description": "The ratio of intensity covered by a predefined list of contaminant proteins compared to the total ion intensity.", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0000190", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.130/psi-ms.obo", - "version": "4.1.130" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", + "version": "4.1.144" } ] } From 7d646a594d16815ee51c41580b42b3189f013bf4 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 10:33:58 +0200 Subject: [PATCH 4/9] Update example --- docs/pages/worked-examples/intro_set.md | 72 +++++++++++++++---- .../examples/intro_set.mzQC | 8 +-- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index b117a61e..70be2920 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -4,9 +4,9 @@ title: "Introduction to mzQC – Multiple Mass Spectrometry Runs" permalink: /examples/intro_set/ --- -This page describes how to use mzQC for analyzing groups, or "sets," of mass spectrometry runs. +In mzQC, collections of mass spectrometry runs are grouped into what are termed "sets." 
This builds upon our understanding of [using mzQC for individual runs](https://hupo-psi.github.io/mzQC/examples/intro_run/), extending it to how you can analyze and represent data from multiple runs together. -Think of a "set" as a bundle of experiments that you want to examine collectively. +Think of a "set" as a bundle of runs that you want to examine collectively, such as technical and biological replicates. > [!TIP] > Sets are versatile! @@ -64,15 +64,17 @@ Each element within `setQualities` defines a distinct set, enabling the comparis A set's QC data is contextual—it makes sense within the bounds of the group. For instance, it wouldn't be right to lump individual run metrics like MS1 scan counts for several runs into a single set metric; those belong to individual run analyses. +Instead, set metrics reflect the collective characteristics of all runs within the set, offering insights into the overall experimental quality. Imagine you have several technical replicates from an experiment with two conditions, and you're interested in grouping these by technical replicates. You might end up with sets for "healthy" and "diseased" conditions, plus a combined "all" set for overarching analyses. As an example, we'll use three different groupings: + 1. The "healthy" set, consisting of technical replicates "techRep1_healthy", "techRep2_healthy", "techRep3_healthy". 2. The "diseased" set, consisting of technical replicates "techRep1_diseased", "techRep2_diseased", "techRep3_diseased". 3. The "all" set, combining both the "healthy" and "diseased" set. -These labels are important, acting as descriptive tags for each set, guiding your analysis. +These labels are important, acting as tags for each set, guiding your analysis. Therefore, it is recommended to use a descriptive label, for example based on the experimental design or the kind of comparisons you want to make. 
``` @@ -109,24 +111,58 @@ Therefore, it is recommended to use a descriptive label, for example based on th ], ``` -Let's dive into an example metric, like the "protein contaminant intensity ratio," indicating how much of your sample is taken up by known contaminants. A higher value suggests more contamination: +Let's dive into an example metric, like the "protein contaminant intensity ratio." +This metric quantifies the abundance arising from known contaminant proteins (like keratins from skin or BSA from sample buffers) compared to the total abundance across all proteins in the sample. +High levels of contaminants can indicate issues with sample preparation or handling, leading to potential biases in the data analysis. ``` { - "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", - "value": 0.25, - "unit": { - "accession": "UO:0000191", - "name": "fraction" - } + "metadata": { + "label": "healthy", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "protein contaminant intensity ratio", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "value": 0.25, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] +}, +{ + "metadata": { + "label": "diseased", + ... + }, + "qualityMetrics": [ + { + "accession": "MS:4000177", + "name": "protein contaminant intensity ratio", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", + "value": 0.31, + "unit": { + "accession": "UO:0000191", + "name": "fraction" + } + } + ] } ``` -For complex analyses, such as comparing protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). +While this metric can be calculated for each run individually, here we have aggregated that information across both the "healthy" and "diseased" sets instead. + +For our second example, we'll use the "all" set that combines the previous "healthy" and "diseased" sets. +To compare protein abundances between healthy and diseased states, we might look at a PCA (principal component analysis). mzQC can store PCA results, capturing the variation between these two states. +For this we extracted protein abundances from the `proteinGroups.txt` file specified as an input file to the "all" set. +This file is produced by MaxQuant and contains quantified protein intensities along with other identification information for each protein group detected in the experiment. + First, let's have a look at what the PCA plot would look like, plotting the first two principal components: ![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) @@ -166,3 +202,13 @@ Next, we'll look at how mzQC can encapsulate such analysis, storing the the firs } } ``` + +Note how the principal components are represented as columns in a table, with each column defined by a CV term. +Additionally, the label is represented by CV term `MS:4000086`, in this case referring to the previous "healthy" and "diseased" sets. +This label can refer to any input files or metadata labels defined in the same mzQC file. +Consequently, we could also have performed the PCA on each input file separately, in which case the labels would have been the names of the individual input files ("techRep1_healthy", "techRep2_healthy", ..., "techRep3_diseased").
+Thus, the table metric can have a flexible number of rows, based on the input of this set and the grouping level used. + +> [!WARNING] +> It would not have been valid to perform a PCA on only the three healthy samples or only the three diseased samples. +> As mentioned previously, QC metrics in sets need to relate to _all_ elements in the set, and the current set includes both the healthy and diseased subsets. diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 90920a98..aafc985b 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -76,7 +76,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { @@ -156,7 +156,7 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", + "name": "contaminant protein abundance fraction", "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.31, "unit": { @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", - "version": "4.1.144" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.157/psi-ms.obo", + "version": "4.1.157" } ] } From 70bf32b5c700b66b46be61efbbfe38065e93f8f2 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 9 Jul 2024 13:54:44 +0200 Subject: [PATCH 5/9] Fix typo Co-authored-by: Chris Bielow --- docs/pages/worked-examples/intro_set.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 70be2920..35aa7dc7 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -167,7 +167,7 @@ First, let's have a look at what the PCA plot would look like, plotting the firs ![PCA plot of the healthy vs diseased samples.](../../pages/figures/intro_set_pca.png) -Next, we'll look at how mzQC can encapsulate such analysis, storing the the first five principal components as a table metric, referenced by the previously defined set labels. +Next, we'll look at how mzQC can encapsulate such analysis, storing the first five principal components as a table metric, referenced by the previously defined set labels. 
``` { From cca63578c7ee0530a9e0b595b0bd37bcd0675d8e Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 24 Jul 2024 15:36:16 +0200 Subject: [PATCH 6/9] Fix protein contaminant metric --- docs/pages/worked-examples/intro_set.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index 70be2920..c8546ac4 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -111,7 +111,7 @@ Therefore, it is recommended to use a descriptive label, for example based on th ], ``` -Let's dive into an example metric, like the "protein contaminant intensity ratio." +Let's dive into an example metric, like the "contaminant protein abundance ratio." This metric quantifies the abundance arising from known contaminant proteins (like keratins from skin or BSA from sample buffers) compared to the total abundance across all proteins in the sample. High levels of contaminants can indicate issues with sample preparation or handling, leading to potential biases in the data analysis. @@ -124,12 +124,12 @@ High levels of contaminants can indicate issues with sample preparation or handl "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "name": "contaminant protein abundance ratio", + "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000191", - "name": "fraction" + "accession": "UO:0010006", + "name": "ratio" } } ] @@ -142,12 +142,12 @@ High levels of contaminants can indicate issues with sample preparation or handl "qualityMetrics": [ { "accession": "MS:4000177", - "name": "protein contaminant intensity ratio", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "name": "contaminant protein abundance ratio", + "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0000191", - "name": "fraction" + "accession": "UO:0010006", + "name": "ratio" } } ] From 10d482983ce3f72a09dd1bf1dc465337d39e7f37 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 24 Jul 2024 15:41:35 +0200 Subject: [PATCH 7/9] Update the mzQC file as well --- specification_documents/examples/intro_set.mzQC | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index aafc985b..4f8464be 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -76,11 +76,11 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance fraction", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", + "name": "contaminant protein abundance ratio", + "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0000191", + "accession": "UO:0010006", "name": "fraction" } } @@ -156,11 +156,11 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance fraction", - "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "name": "contaminant protein abundance ratio", + "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0000191", + "accession": "UO:0010006", "name": "fraction" } } From 5a8dc72b16d64826e2765447c4431edfee92fe35 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 24 Jul 2024 15:42:40 +0200 Subject: [PATCH 8/9] Update the OBO version --- specification_documents/examples/intro_set.mzQC | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 4f8464be..39d60289 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.157/psi-ms.obo", - "version": "4.1.157" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.163/psi-ms.obo", + "version": 
"4.1.163" } ] } From e898d528c73eb8337819b70ff0a5d6d6d7623b5f Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 29 Jul 2024 12:57:53 +0200 Subject: [PATCH 9/9] Updated CV term definition --- docs/pages/worked-examples/intro_set.md | 22 +++++++++---------- .../examples/intro_set.mzQC | 16 +++++++------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/pages/worked-examples/intro_set.md b/docs/pages/worked-examples/intro_set.md index aecf5533..8bf6314a 100644 --- a/docs/pages/worked-examples/intro_set.md +++ b/docs/pages/worked-examples/intro_set.md @@ -44,8 +44,8 @@ It is typically included at the end of the mzQC file. "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.144/psi-ms.obo", - "version": "4.1.144" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.165/psi-ms.obo", + "version": "4.1.165" } ] ``` @@ -111,7 +111,7 @@ Therefore, it is recommended to use a descriptive label, for example based on th ], ``` -Let's dive into an example metric, like the "contaminant protein abundance ratio." +Let's dive into an example metric, like the "contaminant protein abundance fraction." This metric quantifies the abundance arising from known contaminant proteins (like keratins from skin or BSA from sample buffers) compared to the total abundance across all proteins in the sample. High levels of contaminants can indicate issues with sample preparation or handling, leading to potential biases in the data analysis. @@ -124,12 +124,12 @@ High levels of contaminants can indicate issues with sample preparation or handl "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance ratio", - "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0010006", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] @@ -142,12 +142,12 @@ High levels of contaminants can indicate issues with sample preparation or handl "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance ratio", - "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0010006", - "name": "ratio" + "accession": "UO:0000191", + "name": "fraction" } } ] diff --git a/specification_documents/examples/intro_set.mzQC b/specification_documents/examples/intro_set.mzQC index 39d60289..07819483 100644 --- a/specification_documents/examples/intro_set.mzQC +++ b/specification_documents/examples/intro_set.mzQC @@ -76,11 +76,11 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance ratio", - "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. 
using the cRAP contaminant database).", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.25, "unit": { - "accession": "UO:0010006", + "accession": "UO:0000191", "name": "fraction" } } @@ -156,11 +156,11 @@ "qualityMetrics": [ { "accession": "MS:4000177", - "name": "contaminant protein abundance ratio", - "description": "The ratio of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", + "name": "contaminant protein abundance fraction", + "description": "The fraction of total protein abundance in a mass spectrometry run or a group of runs which can be attributed to a user-defined list of contaminant proteins (e.g. using the cRAP contaminant database).", "value": 0.31, "unit": { - "accession": "UO:0010006", + "accession": "UO:0000191", "name": "fraction" } } @@ -331,8 +331,8 @@ "controlledVocabularies": [ { "name": "Proteomics Standards Initiative Mass Spectrometry Ontology", - "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.163/psi-ms.obo", - "version": "4.1.163" + "uri": "https://github.com/HUPO-PSI/psi-ms-CV/releases/download/v4.1.165/psi-ms.obo", + "version": "4.1.165" } ] }