- https://github.com/jotsetung/metabolomics2018
- Open xcms-preprocessing.Rmd in e.g. RStudio.
This presentation focuses on updates of xcms
:
- re-use data structures from Bioconductor’s
MSnbase
package - simplified raw data access
24 June 2018; Updated 20 June 2019
This presentation focuses on updates of xcms
:
MSnbase
packagefls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE) ## Define a data.frame with additional information on the files. pd <- data.frame(file = basename(fls), injection_idx = c(1, 19), sample = c("POOL_1", "POOL_2"), group = "POOL")
readMSData
function.## Read the data data <- readMSData(fls, pdata = new("NAnnotatedDataFrame", pd), mode = "onDisk")
mode = "onDisk"
: reads only spectrum header from files, but no data.pData
or $
:## Access phenotype information pData(data)
## file injection_idx sample group ## 1 20171016_POOL_POS_1_105-134.mzML 1 POOL_1 POOL ## 2 20171016_POOL_POS_3_105-134.mzML 19 POOL_2 POOL
## Or individual columns directly using the $ operator data$injection_idx
## [1] 1 19
msLevel
, centroided
, rtime
, polarity
.spectra
, mz
, intensity
: reads data from files.filterFile
subset to individual files/samples.filterRtime
restrict to specific retention time window.filterMz
restrict to m/z range.filterMsLevel
subset to certain MS level(s).%>%
(pipe) operator to avoid nested function calls.## Get all spectra measured between 180 and 181 seconds ## Use %>% for better readability sps <- data %>% filterRt(rt = c(180, 181)) %>% spectra
## How many spectra? length(sps)
## [1] 6
## From which file? sapply(sps, fromFile)
## F1.S646 F1.S647 F1.S648 F2.S646 F2.S647 F2.S648 ## 1 1 1 2 2 2
plot(sps[[6]])
chromatogram
: extract chromatographic data.
Example: XIC for Serine (m/z of [M+H]+ adduct 106.0455).
data %>% filterRt(rt = c(175, 189)) %>% filterMz(mz = c(106.02, 106.07)) %>% chromatogram(aggregationFun = "max") %>% plot()
xcms
, specifically centWave was designed for centroided data.MSnbase
provides basic tools to perform MS data smoothing and centroiding: smooth
and pickPeaks
.data %>% filterRt(rt = c(175, 189)) %>% filterMz(mz = c(106.02, 106.07)) %>% plot(type = "XIC")
type = "XIC"
creates a combined chromatographic and map visualization of the data.?pickPeaks
for more advanced options.## Smooth the signal, then do a simple peak picking. data_cent <- data %>% smooth(method = "SavitzkyGolay", halfWindowSize = 6) %>% pickPeaks()
## Plot the centroided data for Serine data_cent %>% filterRt(rt = c(175, 189)) %>% filterMz(mz = c(106.02, 106.07)) %>% plot(type = "XIC")
## Write the centroided data to files with the same names in the current ## directory fls_new <- basename(fileNames(data)) writeMSData(data_cent, file = fls_new) ## Read the centroided data. data_cent <- readMSData(fls_new, pdata = new("NAnnotatedDataFrame", pd), mode = "onDisk")
findChromPeaks
.MatchedFilterParam
) [Smith Anal. chem. 2006].CentWaveParam
) [Tautenhahn BMC Bioinformatics 2008].MassifquantParam
[Conley Bioinformatics 2014].[Tautenhahn BMC Bionformatics 2008]
[Tautenhahn BMC Bionformatics 2008]
Crucial centWave parameters: peakwidth
, ppm
; list all with ?CentWaveParam
.
peakwidth
: minimal and maximal expected peak width.
srn_chr <- chromatogram(data_cent, rt = c(165, 200), mz = c(106.03, 106.06), aggregationFun = "max") plot(srn_chr)
Chromatogram
objects.cwp <- CentWaveParam() findChromPeaks(srn_chr, param = cwp)
peakwidth
?peakwidth(cwp)
## [1] 20 50
peakwidth
does not match the current data.peakwidth
and run peak detection again.chromatogram
will also extract identified chrom peaks.peakwidth(cwp) <- c(2, 10) srn_chr <- findChromPeaks(srn_chr, param = cwp) ## Plot the data and higlight identified peak area plot(srn_chr)
ppm
: maximal allowed scattering of m/z values for one ion.
Example: evaluate the m/z scattering of the signal for Serine.
## Restrict the data to signal from Serine srn <- data_cent %>% filterRt(rt = c(179, 186)) %>% filterMz(mz = c(106.04, 106.06)) ## Plot the data plot(srn, type = "XIC")
## Extract mz values for Serine from first file srn_mz <- unlist(mz(filterFile(srn, 1))) ## The difference between m/z values from consecutive scans in ppm diff(srn_mz) * 1e6 / mean(srn_mz)
## F1.S643 F1.S644 F1.S645 F1.S646 F1.S647 ## 13.695973646 -27.391665930 1.112565444 13.695804399 0.000000000 ## F1.S648 F1.S649 F1.S650 F1.S651 F1.S652 ## -0.158840806 0.000000000 0.000000000 -0.682098923 0.000000000 ## F1.S653 F1.S654 F1.S655 F1.S656 F1.S657 ## 0.000000000 0.007189239 -13.695795336 13.695795336 -12.807200180 ## F1.S658 F1.S659 F1.S660 F1.S661 F1.S662 ## 0.000000000 0.000000000 13.443799681 0.000000000 -13.695795190 ## F1.S663 F1.S664 F1.S665 F1.S666 ## 13.957010392 0.000000000 0.000000000 -14.085629933
ppm
: large enough to capture the full chromatographic peak.## Perform peak detection ppm(cwp) <- 30 data_cent <- findChromPeaks(data_cent, param = cwp)
XCMSnExp
object extends the OnDiskMSnExp
, contains preprocessing results and enables data access as described above.
chromPeaks
to access the peak detection results.head(chromPeaks(data_cent), n = 5)
## mz mzmin mzmax rt rtmin rtmax into intb ## CP001 111.0443 111.0431 111.0476 25.670 24.554 27.065 574.3303 562.2062 ## CP002 129.0541 129.0522 129.0553 25.391 24.275 27.065 806.1325 770.1417 ## CP003 114.0727 114.0715 114.0731 26.786 25.670 28.460 767.0744 764.5634 ## CP004 111.0057 111.0044 111.0073 28.739 27.623 29.297 397.1809 395.7859 ## CP005 127.0387 127.0378 127.0410 28.739 27.902 29.576 227.0865 223.2144 ## maxo sn sample ## CP001 578.7692 30 1 ## CP002 733.1538 21 1 ## CP003 498.2657 497 1 ## CP004 400.4895 399 1 ## CP005 231.5594 16 1
adjustRtime
.ObiwarpParam
) [Prince Anal. chem. 2006]: warps the (full) data to a reference sample.PeakGroupsParam
) [Smith Anal. chem. 2006]:
## Define the settings for the initial peak grouping pdp <- PeakDensityParam(sampleGroups = data_cent$group, bw = 1.8, minFraction = 1, binSize = 0.02) data_cent <- groupChromPeaks(data_cent, pdp)
## Define settings for the alignment pgp <- PeakGroupsParam(minFraction = 1, span = 0.6) data_cent <- adjustRtime(data_cent, param = pgp)
plotAdjustedRtime(data_cent)
## Use adjustedRtime parameter to access raw/adjusted retention times par(mfrow = c(1, 2), mar = c(4, 4.5, 0.9, 0.5)) plot(chromatogram(data_cent, mz = c(106.04, 106.06), rt = c(179, 186), adjustedRtime = FALSE)) plot(chromatogram(data_cent, mz = c(106.04, 106.06), rt = c(179, 186)))
groupChromPeaks
.PeakDensityParam
) [Smith Anal. chem. 2006].NearestPeaksParam
) [Katajamaa Bioinformatics 2006].plotChromPeakDensity
: plot distribution of identified peaks along rt for a given m/z slice; simulates correspondence analysis.plotChromPeakDensity
to simulate a correspondence analysis in the same slice.## Extract a BPC for an m/z slice containing serine bpc_srn <- chromatogram(data_cent, mz = c(106.04, 106.06), aggregationFun = "max") ## Get default parameters for the grouping pdp <- PeakDensityParam(sampleGroups = data_cent$group) ## Dry-run correspondence and show the results. plotChromPeakDensity(bpc_srn, param = pdp)
binSize
: m/z width of the data slice in which peaks are grouped.bw
defines the smoothness of the density function.maxFeatures
: maximum number of features to be defined in one bin.minFraction
: minimum proportion of samples (of one group!) for which a peak has to be present.minSamples
: minimum number of samples a peak has to be present.minFraction
and minSamples
depend on experimental layout!binSize
should be small enough to avoid peaks from different ions measured at similar retention times to be grouped together.bw
is the most important parameter.## Plot the chromatogram for an m/z slice containing Betaine and Valine mzr <- 118.08625 + c(-0.01, 0.01) chr <- chromatogram(data_cent, mz = mzr, aggregationFun = "max") ## Correspondence in that slice using default settings pdp <- PeakDensityParam(sampleGroups = data_cent$group) plotChromPeakDensity(chr, param = pdp)
bw
(30
) too large for present data set.plotChromPeakDensity
allows to evaluate and tune settings on data subsets.
Test smaller bw
(1.8
) on the same slice.
## Reducing the bandwidth pdp <- PeakDensityParam(sampleGroups = data_cent$group, bw = 1.8) plotChromPeakDensity(chr, param = pdp)
bw
enabled grouping of isomers into different features.pdp <- PeakDensityParam(sampleGroups = data_cent$group, bw = 1.8, minFraction = 0.4, binSize = 0.02) ## Perform the correspondence analysis data_cent <- groupChromPeaks(data_cent, param = pdp)
plotChromPeakDensity
with simulate = FALSE
shows the actual results from the correspondence.XCMSnExp
object, can be accessed with featureDefinitions
.featureValues
to access the features’ abundance estimates.## feature intensity matrix fmat <- featureValues(data_cent, value = "into", method = "maxint") head(fmat)
## 20171016_POOL_POS_1_105-134.mzML 20171016_POOL_POS_3_105-134.mzML ## FT001 3159.7569 3093.752 ## FT002 4762.3987 NA ## FT003 744.8752 1033.232 ## FT004 20211.2634 15839.550 ## FT005 10220.8762 10837.710 ## FT006 19653.1073 31816.844
featureValues
parameters:
value
: name of the column in chromPeaks
that should be returned.method
: for features with multiple peaks in one sample: from which peak should the value be returned?fillChromPeaks
allows to fill-in signal for missing peaks from the feature area (defined by the median rt and mz of all peaks assigned to the feature).fillChromPeaks
Parameters:
expandMz
, expandRt
: expands the region from which signal is integrated in m/z or rt dimension. A value of 0 means no expansion, 1 means the region is grown by half of the feature’s m/z width on both sides.ppm
: expand the m/z width by a m/z dependent value.MSnbase
, mzR
.xcms
, mzR