R/preproc_utils.R

Defines functions ImportRawMSDataList ImportRawMSData SetClass PlotEIC SetPeakParam PerformPeakProfiling SetAnnotationParam PerformPeakAnnotation FormatPeakList ExtractMS2data .getDdaMS2Scans PlotMS2Spectra

Documented in ExtractMS2data FormatPeakList ImportRawMSData ImportRawMSDataList PerformPeakAnnotation PerformPeakProfiling PlotEIC PlotMS2Spectra SetAnnotationParam SetClass SetPeakParam

#' Import raw MS data
#' @description This function handles the reading in of 
#' raw MS data (.mzML, .CDF and .mzXML). Users must provide 
#' a matrix with meta information about file such that each file has the name,
#' file path, group class and extension type.
#' The function will output two chromatograms into the user's working directory, a 
#' base peak intensity chromatogram (BPIC) and a total ion 
#' chromatogram (TIC). Further, this function sets the number of cores
#' to be used for parallel processing. It first determines the number of cores 
#' within a user's computer and then sets it that number/2.  
#' @param dataset.meta Matrix, input the meta data for files containing
#' the raw MS spectra to be processed.
#' @param format Character, input the format of the image to create.
#' @param dpi Numeric, input the dpi of the image to create.
#' @param width Numeric, input the width of the image to create.
#' @param par.cores Logical, if true, the function will automatically 
#' set the number of parallel cores. If false, it will not.
#' @param plot Logical, if true the function will create BPIS and TICS plots.
#' @param bpis_name Character, input the name of the BPIS image to create.
#' @param tics_name Character, input the name of the TICS image to create.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import xcms
#' @import MSnbase
#' @import BiocParallel
#' @import parallel

ImportRawMSDataList <- function(dataset.meta, format = "png", dpi = 72, width = 9, 
                            par.cores=TRUE, plot=TRUE, bpis_name = "BPIS_", tics_name="TICS_"){
  
  msg.vec <<- vector(mode="character")

  if(bpis_name == "BPIS_"){
	bpis_name = paste("BPIS_", dpi, ".", format, sep="");
  }
  if(tics_name == "TICS_"){
	tics_name <- paste("TICS_", dpi, ".", format, sep="");
  }
  
  msg <- c("The uploaded files are raw MS spectra.");
  
  # The dataset.meta should be a matrix such that each row has the following:
  #   1- file name, 2- file path, 3- group and 4- file extension type
  # provided. The accepted file extensions are (.mzML/.CDF/.mzXML files)
  
  if(nrow(dataset.meta) == 0 || is.na(dataset.meta)){
    AddErrMsg("No spectra were found!");
    return(0);    
  }
  
  compfile.types <- sum(sapply(dataset.meta[,3], function(x){x %in% c("mzml", "cdf", "mzxml")}))
  if(compfile.types < nrow(dataset.meta)){
    AddErrMsg("Only mzML, cdf and mzXML input types can be handled!");
    return(0);    
  }

  snames <- dataset.meta[,1]
  files <- dataset.meta[,2]; files <- as.character(files); # Otherwise, a factor form of files will cause an error
  sclass <- dataset.meta[,4]

  # some sanity check before proceeds
  sclass <- as.factor(sclass);
  if(length(levels(sclass))<2){
    AddErrMsg("You must provide classes labels (at least two classes)!");
    return(0);
  }
  
  SetClass(sclass)
  
  # check for unique sample names
  if(length(unique(snames))!=length(snames)){
    AddErrMsg("Duplicate sample names are not allowed!");
    dup.nm <- paste(snames[duplicated(snames)], collapse=" ");
    AddErrMsg("Duplicate sample names are not allowed!");
    AddErrMsg(dup.nm);
    return(0);
  }
  
  pd <- data.frame(sample_name = snames,
                   sample_group = sclass,
                   stringsAsFactors = FALSE)
  
  if(!.on.public.web & par.cores==TRUE){
    cores <- parallel::detectCores()
    num_cores <- ceiling(cores/2) 
    print(paste0("The number of CPU cores to be used is set to ", num_cores, "."))
    
    if (.Platform$OS.type == "unix") {
      BiocParallel::register(BiocParallel::bpstart(BiocParallel::MulticoreParam(num_cores)))
    } else { # for windows
      BiocParallel::register(BiocParallel::bpstart(BiocParallel::SnowParam(num_cores)))
    }
  }
  
  raw_data <- suppressMessages(readMSData(files = files, pdata = new("NAnnotatedDataFrame", pd),
                                          mode = "onDisk")) 
  
  if(plot==TRUE){
    # Plotting functions to see entire chromatogram
    bpis <- chromatogram(raw_data, aggregationFun = "max")
    tics <- chromatogram(raw_data, aggregationFun = "sum")
    
    groupNum <- nlevels(groupInfo)
    
    if(groupNum > 9){
      col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
      group_colors <- col.fun(groupNum)
    }else{
      group_colors <- paste0(RColorBrewer::brewer.pal(9, "Set1")[1:groupNum], "60")
    }
    
    names(group_colors) <- levels(groupInfo)
    
    Cairo::Cairo(file = bpis_name, unit="in", dpi=dpi, width=width, height= width*5/9, 
                 type=format, bg="white");
    plot(bpis, col = group_colors[raw_data$sample_group])
    legend("topright", legend=levels(groupInfo), pch=15, col=group_colors);
    dev.off();
    
    Cairo::Cairo(file = tics_name, unit="in", dpi=dpi, width=width, height=width*5/9, 
                 type=format, bg="white");
    plot(tics, col = group_colors[raw_data$sample_group])
    legend("topright", legend=levels(groupInfo), pch=15, col=group_colors);
    dev.off();
  }
  
  print("Successfully imported raw MS data!")
  
  return(raw_data)
}

#' Import raw MS data
#' @description This function handles the reading in of 
#' raw MS data (.mzML, .CDF and .mzXML). Users must set 
#' their working directory to the folder containing their raw 
#' data, divided into two subfolders named their desired group labels. The  
#' function will output two chromatograms into the user's working directory, a 
#' base peak intensity chromatogram (BPIC) and a total ion 
#' chromatogram (TIC). Further, this function sets the number of cores
#' to be used for parallel processing. It first determines the number of cores 
#' within a user's computer and then sets it that number/2.  
#' @param foldername Character, input the file path to the folder containing
#' the raw MS spectra to be processed.
#' @param format Character, input the format of the image to create.
#' @param dpi Numeric, input the dpi of the image to create.
#' @param width Numeric, input the width of the image to create.
#' @param par.cores Logical, if true, the function will automatically 
#' set the number of parallel cores. If false, it will not.
#' @param plot Logical, if true the function will create BPIS and TICS plots.
#' @param plot.opts By default, it will create BPIS and TICS plots using up to 10 samples
#' per group. Set to "all" to create plots using all samples, though this may cause memory issues.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import xcms
#' @import MSnbase
#' @import BiocParallel
#' @import parallel

ImportRawMSData <- function(foldername, format = "png", dpi = 72, width = 9, 
                            par.cores=TRUE, plot=TRUE, plot.opts="default"){

  msg.vec <<- vector(mode="character")
  
  msg <- c("The uploaded files are raw MS spectra.");
  
  # the "upload" folder should contain two subfolders (groups, i.e. Healthy vs. Disease)
  # each subfolder must contain samples (.mzML/.CDF/.mzXML files)
  
  files <- dir(foldername, pattern=".mzML|.cdf|.mzXML|.mzData", recursive=T, full.name=TRUE)
  
  if (length(files) == 0) {
    AddErrMsg("No spectra were found!");
    return(0);
  }
  
  snames <- gsub("\\.[^.]*$", "", basename(files));
  msg<-c(msg, paste("A total of ", length(files), "samples were found."));
  
  sclass <- gsub("^\\.$", "sample", dirname(files));
  
  scomp <- strsplit(substr(sclass, 1, min(nchar(sclass))), "");
  scomp <- matrix(c(scomp, recursive = TRUE), ncol = length(scomp));
  i <- 1
  while(all(scomp[i,1] == scomp[i,-1]) && i < nrow(scomp)){
    i <- i + 1;
  }
  i <- min(i, tail(c(0, which(scomp[1:i,1] == .Platform$file.sep)), n = 1) + 1)
  if (i > 1 && i <= nrow(scomp)){
    sclass <- substr(sclass, i, max(nchar(sclass)))
  }
  
  # some sanity check before proceeds
  sclass <- as.factor(sclass);
  if(length(levels(sclass))<2){
    AddErrMsg("You must provide classes labels (at least two classes)!");
    return(0);
  }
  
  SetClass(sclass)
  
  # check for unique sample names
  if(length(unique(snames))!=length(snames)){
    AddErrMsg("Duplicate sample names are not allowed!");
    dup.nm <- paste(snames[duplicated(snames)], collapse=" ");
    AddErrMsg("Duplicate sample names are not allowed!");
    AddErrMsg(dup.nm);
    return(0);
  }
  
  pd <- data.frame(sample_name = snames,
                   sample_group = sclass,
                   stringsAsFactors = FALSE)
  
  if(!.on.public.web & par.cores==TRUE){
    cores <- parallel::detectCores()
    num_cores <- ceiling(cores/2) 
    print(paste0("The number of CPU cores to be used is set to ", num_cores, "."))
    
    if (.Platform$OS.type == "unix") {
      BiocParallel::register(BiocParallel::bpstart(BiocParallel::MulticoreParam(num_cores)))
      } else { # for windows
      BiocParallel::register(BiocParallel::bpstart(BiocParallel::SnowParam(num_cores)))
      }
  }
  
  raw_data <- suppressMessages(readMSData(files = files, pdata = new("NAnnotatedDataFrame", pd),
                                          mode = "onDisk")) 
  
  if(plot.opts=="default"){
    #subset raw_data to first 50 samples
    print("To reduce memory usage BPIS and TICS plots will be created using only 10 samples per group.")
    
    grp_nms <- names(table(pd$sample_group))
    files <- NA
    
    for(i in 1:length(grp_nms)){
      numb2ext <- min(table(pd$sample_group)[i], 10)
      filt_df <- pd[pd$sample_group==grp_nms[i],]
      files.inx <- sample(nrow(filt_df), numb2ext)
      sel.samples <- filt_df$sample_name[files.inx]
      files <- c(files, which(pd$sample_name %in% sel.samples))
    }
    
    raw_data_filt <- filterFile(raw_data, file=na.omit(files));
  
  }else{
    raw_data_filt <- raw_data; # just for plotting
  }
  
  if(plot==TRUE){
    
    if(plot.opts=="all"){
      h <- readline(prompt="Using all samples to create BPIS and TICS plots may cause severe memory issues! Press [0] to continue, or [1] to cancel: ")
      h <- as.integer(h)
      if(h==1){
        print("ImportRawMSData function aborted!")
        return(0)
      }
    }
    
    print("Plotting BPIS and TICS.")
    
    # Plotting functions to see entire chromatogram
    bpis <- chromatogram(raw_data_filt, aggregationFun = "max")
    tics <- chromatogram(raw_data_filt, aggregationFun = "sum")
    
    groupNum <- nlevels(groupInfo)
    
    if(groupNum > 9){
      col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
      group_colors <- col.fun(groupNum)
    }else{
      group_colors <- paste0(RColorBrewer::brewer.pal(9, "Set1")[1:groupNum], "60")
    }
    
    names(group_colors) <- levels(groupInfo)
    
    bpis_name <- paste("BPIS_", dpi, ".", format, sep="");
    tics_name <- paste("TICS_", dpi, ".", format, sep="");
    
    Cairo::Cairo(file = bpis_name, unit="in", dpi=dpi, width=width, height= width*5/9, 
                 type=format, bg="white");
    plot(bpis, col = group_colors[raw_data_filt$sample_group])
    legend("topright", legend=levels(groupInfo), pch=15, col=group_colors);
    dev.off();
    
    Cairo::Cairo(file = tics_name, unit="in", dpi=dpi, width=width, height=width*5/9, 
                 type=format, bg="white");
    plot(tics, col = group_colors[raw_data_filt$sample_group])
    legend("topright", legend=levels(groupInfo), pch=15, col=group_colors);
    dev.off();
  }
  
  print("Successfully imported raw MS data!")
  
  return(raw_data)
}

#' Set class information for MS data
#' @description This function sets the class information
#' for preprocessing MS data.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
SetClass <- function(class){
  groupInfo <<- class
}

#' Plot EIC
#' @description This functionn creates an extracted ion chromatogram (EIC) for a specific
#' m/z and retention time. This is used for quality-control of raw m/s data.
#' @param raw_data The object created using the InspectRawMSData function,
#' containing the raw MS data.
#' @param rt_mn Numeric, specify the minimum bound of the retention time range.
#' @param rt_mx Numeric, specify the maximum bound of the retention time range.
#' @param mz_mn Numeric, specify the minimum bound of the m/z range.
#' @param mz_mx Numeric, specify the maximum bound of the m/z range.
#' @param aggreg Character, if "sum", creates a total ion chromatogram. 
#' If "max", creates a base peak chromatogram. By default it is set 
#' to "sum". 
#' @param format Character, input the format of the image to create.
#' @param dpi Numeric, input the dpi of the image to create.
#' @param width Numeric, input the width of the image to create. 
#' @export

PlotEIC <- function(raw_data, rt_mn, rt_mx, mz_mn, mz_mx, aggreg = "sum",
                    format = "png", dpi = 72, width = 9){
  
  filt_data <- filterRt(raw_data, rt = c(rt_mn, rt_mx))
  filt_mz <- filterMz(filt_data, mz = c(mz_mn, mz_mx))
  mz_slice <- chromatogram(filt_mz, aggregationFun = aggreg)
  
  groupNum <- nlevels(groupInfo)
  
  if(groupNum > 9){
    col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
    group_colors <- col.fun(groupNum)
  }else{
    group_colors <- paste0(RColorBrewer::brewer.pal(9, "Set1")[1:groupNum], "60")
  }
  
  names(group_colors) <- unique(raw_data$sample_group)
  
  eic_name <- paste("EIC_", dpi, ".", format, sep="");
  
  Cairo::Cairo(file = eic_name, unit="in", dpi=dpi, width=width, height= width*5/9, 
               type=format, bg="white");
  plot(mz_slice, col = group_colors[raw_data$sample_group])
  legend("topright", legend=unique(raw_data$sample_group), pch=15, col=group_colors);
  dev.off();
  
  print("EIC created!") 
  return(1)
}

#' Set parameters for peak picking using XCMS and CAMERA
#' @description This function sets all the parameters used for downstream
#' pre-processing of user's raw MS data. 
#' @param alg Character, specify the algorithm to perform peak detection. "centwave" 
#' to use the CentWave algorithm, and "match_filt" to use the MatchedFilter algorithm.
#' @param ppm Numeric, specify the mass error in ppm.
#' @param min_pkw Numeric, specify the minimum peak width in seconds.
#' @param max_pkw Numeric, specify the maximum peak width in seconds. 
#' @param sn_thresh Numeric, specify the signal to noise threshold.
#' @param mzdiff Numeric, specify the minimum m/z difference for signals to be considered as 
#' different features when retention times are overlapping. 
#' @param bw Numeric, specify the band width (sd or half width at half maximum) of gaussian 
#' smoothing kernel to be applied during peak grouping.
#' @param min_frac Numeric, specify fraction of samples in each group that contain the feature for it to be grouped.
#' @param min_sample_num Numeric, specify minimum number of sample(s) in each group that contain the feature for it to be included.
#' @param max_feats Numeric, specify the maximum number of features to be identified.
#' @param peakgroup Boolean, if true, PeakGroup algorithm is used for peak alignment; if false, Obiwarp method is used. 
#' @param bin_size Numeric, specify the bin size (in m/z) to be used for the profile matrix generation used for peak alignment (Obiwarp method).
#' @param min_frac_retcor Numeric, specify fraction of samples in all groups that contain the peaks for them to be aligned (PeakGroup method).
#' @param rt_filt Boolean, if true, users must specify the minimum and maximum retention
#' time to be included in the analysis. By default this is set to 200 - 1000. 
#' @param rt_min Numeric, specify the minimum retention time.
#' @param rt_max Numeric, specify the maximum retention time.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
SetPeakParam <- function(alg = "centwave", ppm = 10, min_pkw = 10, 
                         max_pkw = 60, sn_thresh = 6, mzdiff = 0.01, bw = 5,
                         min_frac = 0.5, min_sample_num = 1,
                         max_feats = 100, 
                         peakgroup = FALSE,
                         bin_size = 1, min_frac_retcor = 0.9,
                         rt_filt = FALSE, 
                         rt_min = 200, rt_max = 1000){
  
  peakParams <- list()
  
  peakParams$algorithm <- alg
  peakParams$polarity <- polarity
  peakParams$ppm <- ppm
  peakParams$min_pkw <- min_pkw
  peakParams$max_pkw <- max_pkw
  peakParams$sn_thresh <- sn_thresh
  peakParams$mzdiff <- mzdiff
  peakParams$bw <- bw
  peakParams$min_frac <- min_frac
  peakParams$min_sample_num <- min_sample_num
  peakParams$max_feats <- max_feats
  peakParams$peakgroup <- peakgroup
  peakParams$bin_size <- bin_size
  peakParams$min_frac_retcor <- min_frac_retcor
  peakParams$rt_filt <- rt_filt
  peakParams$rt_min <- rt_min
  peakParams$rt_max <- rt_max
  
  return(peakParams)
}

#' Perform peak annotation
#' This function performs feature extraction of user's raw MS data using 
#' the rawData object created using the InspectRawMSData function.
#' @param rawData The object created using the InspectRawMSData function,
#' containing the raw MS data.
#' @param peakParams The object created using the SetPeakParam function, 
#' containing user's specified or default parameters for downstream 
#' raw MS data pre-processing.
#' @param rtPlot Logical, if true creates a plot of retention time correction.
#' Defaut is set to true.
#' @param pcaPlot Logical, if true creates a PCA plot to evaluate the sample grouping.
#' Default is set to true.
#' @param labels Logical, if true, the PCA plot will be annotated with sample names.
#' @param format Character, input the format of the image to create.
#' @param dpi Numeric, input the dpi of the image to create.
#' @param width Numeric, input the width of the image to create.
#' @param rtplot_name Character, input the name of the RT adjustment image to create.
#' @param pcaplot_name Character, input the name of the PCA image to create.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import xcms
#' @import ggplot2
PerformPeakProfiling <- function(rawData, peakParams, rtPlot = TRUE, pcaPlot = TRUE,
                                 labels = TRUE, format = "png", dpi = 72, width = 9, rtplot_name="RT_adjustment", pcaplot_name="PCA_plot"){
  if(.on.public.web){
    load_ggplot()
    load_xcms()
  }

  if(rtplot_name=="RT_adjustment"){
    rtplot_name <- paste("RT_adjustment", dpi, ".", format, sep="")
  }
  if(pcaplot_name=="PCA_plot"){
    pcaplot_name <- paste("PCA_plot", dpi, ".", format, sep="")
  }
  
  # First check if data should be filtered by RT
  if(peakParams$rt_filt == TRUE){
    rtmin <- peakParams$rt_min
    rtmax <- peakParams$rt_max
    filt_data <- filterRt(rawData, rt = c(rtmin, rtmax))
  }else{
    filt_data <- rawData
  }
  
  # Peak picking
  
  print("Step 1/3: Started peak picking! This step will take some time...")
  
  if(peakParams$algorithm == "centwave"){
    cwp <- CentWaveParam(ppm = peakParams$ppm, peakwidth = c(peakParams$min_pkw, peakParams$max_pkw),
                         snthresh = peakParams$sn_thresh, mzdiff = peakParams$mzdiff) 
    xdata <- findChromPeaks(filt_data, param = cwp)
  }else{
    mfp <- MatchedFilterParam(binSize = peakParams$bin_size, mzdiff = peakParams$mzdiff)
    xdata <- findChromPeaks(filt_data, param = mfp)
  }
  
  print("Step 1/3: Successfully performed peak picking!")
  
  # Peak alignment (group, RT correction, group, fill in missing peaks)
  gdp <- PeakDensityParam(sampleGroups = xdata$sample_group, bw = peakParams$bw, 
                          minFraction = peakParams$min_frac, minSamples = peakParams$min_sample_num,
                          maxFeatures = peakParams$max_feats)
  
  grouped_xdata <- groupChromPeaks(xdata, param = gdp)
  
  ## RT correction 
  if(peakParams$peakgroup == TRUE){
    rt_xdata <- adjustRtime(grouped_xdata, param = PeakGroupsParam(minFraction = peakParams$min_frac_retcor))
  }else{
    rt_xdata <- adjustRtime(grouped_xdata, param = ObiwarpParam(binSize = peakParams$bin_size))
  }
  
  print("Step 2/3: Successfully performed retention time adjustment!")
  
  groupNum <- nlevels(groupInfo)
  
  if(rtPlot == TRUE){
    Cairo::Cairo(file = rtplot_name, unit="in", dpi=dpi, width=width, height=width*5/9, 
                 type=format, bg="white")
    if(groupNum > 9){
      col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
      group_colors <- col.fun(groupNum)
    }else{
      group_colors <- paste0(RColorBrewer::brewer.pal(9, "Set1")[1:groupNum], "60")
    }
    names(group_colors) <- unique(rt_xdata$sample_group)
    # Different color parameters required for RT correction plot from Obiwarp and PeakGroup methods.
    if(peakParams$peakgroup == TRUE){
      plotAdjustedRtime(rt_xdata, peakGroupsCol = group_colors[rt_xdata$sample_group])
      legend("topright", legend=unique(rt_xdata$sample_group), pch=15, col=group_colors);
    }else{
      plotAdjustedRtime(rt_xdata, col = group_colors[rt_xdata$sample_group])
      legend("topright", legend=unique(rt_xdata$sample_group), pch=15, col=group_colors);
    }
    dev.off()
  }
  
  grouped_xdata2 <- groupChromPeaks(rt_xdata, param = gdp)
  filled <- fillChromPeaks(grouped_xdata2)
  
  if(pcaPlot == TRUE){
    Cairo::Cairo(file = pcaplot_name, unit="in", dpi=dpi, width=width, height=width*6/9, 
                 type=format, bg="white");
    par(mfrow=c(1,2));
    
    pca_feats <- log2(featureValues(grouped_xdata2, value = "into"))
    xdata_pca <- prcomp(t(na.omit(pca_feats)), center = TRUE, scale=T)
    sum.pca <- summary(xdata_pca)
    var.pca <- sum.pca$importance[2,] # variance explained by each PCA
    
    xlabel <- paste("PC1", "(", round(100*var.pca[1],1), "% variance)");
    ylabel <- paste("PC2", "(", round(100*var.pca[2],1), "% variance)");
    
    # using ggplot2
    df <- as.data.frame(xdata_pca$x)
    df$group <- grouped_xdata2$sample_group

    if(labels==TRUE){
      
      if(groupNum>9){
        col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
        p <- ggplot2::ggplot(df, aes(x = PC1, y = PC2, color=group, label=row.names(df))) + geom_text() + geom_point(size = 3) + fill=col.fun(groupNum)
      }else{
        p <- ggplot2::ggplot(df, aes(x = PC1, y = PC2, color=group, label=row.names(df))) + geom_text() + geom_point(size = 3) + scale_color_brewer(palette="Set1")
      }
      
    }else{
      if(groupNum>9){
        col.fun <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
        p <- ggplot2::ggplot(df, aes(x = PC1, y = PC2, color=group)) + geom_point(size = 3) + scale_color_brewer(palette="Set1") + fill=col.fun(groupNum)
      }else{
        p <- ggplot2::ggplot(df, aes(x = PC1, y = PC2, color=group)) + geom_point(size = 3) + scale_color_brewer(palette="Set1")
      }
    }
    
    p <- p + xlab(xlabel) + ylab(ylabel) + theme_bw()
    print(p)
    dev.off();
  }
  
  xset <- as(filled, "xcmsSet")
  
  print("Step 3/3: Successfully performed peak alignment!")
  return(xset)
}


#' Set annotation parameters
#' @description This function sets the parameters for peak annotation.
#' @param polarity Character, specify the polarity of the MS instrument. Either
#' "negative" or "positive".
#' @param perc_fwhm Numeric, set the percentage of the width of the FWHM for peak grouping. 
#' Default is set to 0.6.
#' @param mz_abs_iso Numeric, set the allowed variance for the search (for isotope annotation).
#' The default is set to 0.005.
#' @param max_charge Numeric, set the maximum number of the isotope charge. For example,
#' the default is 2, therefore the max isotope charge is 2+/-. 
#' @param max_iso Numeric, set the maximum number of isotope peaks. For example, the default
#' is 2, therefore the max number of isotopes per peaks is 2.
#' @param corr_eic_th Numeric, set the threshold for intensity correlations across samples.
#' Default is set to 0.85.
#' @param mz_abs_add Numeric, set the allowed variance for the search (for adduct annotation).
#' The default is set to 0.001.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
SetAnnotationParam <- function(polarity = "positive", perc_fwhm = 0.6, mz_abs_iso = 0.005,
                               max_charge = 2, max_iso = 2, corr_eic_th = 0.85,
                               mz_abs_add = 0.001){
  
  annParams <- list()
  
  annParams$polarity <- polarity
  annParams$perf.whm <- perc_fwhm
  annParams$mz.abs.iso <- mz_abs_iso
  annParams$max.charge <- max_charge
  annParams$max.iso <- max_iso
  annParams$corr.eic.th <- corr_eic_th
  annParams$mz.abs.add <- mz_abs_add
  
  return(annParams)
}

#' Perform peak annotation
#' @description This function performs peak annotation on
#' the xset object created using the PerformPeakPicking function.
#' @param xset The object created using the PerformPeakPicking function,
#' containing the peak picked MS data.
#' @param annParams The object created using the SetAnnotationParam function, 
#' containing user's specified or default parameters for downstream 
#' raw MS data pre-processing.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import CAMERA
PerformPeakAnnotation <- function(xset, annParams){
  
  if(.on.public.web){
    load_camera()
  }
  
  # Perform peak annotation
  print("Starting peak annotation...")
  xsa <- xsAnnotate(xset)
  xsaF <- groupFWHM(xsa, perfwhm = annParams$perf.whm) 
  xsaFI <- findIsotopes(xsaF, mzabs = annParams$mz.abs.iso, maxcharge = annParams$max.charge,
                        maxiso = annParams$max.iso)
  xsaC <- groupCorr(xsaFI, cor_eic_th = annParams$corr.eic.th) 
  xsaFA <- findAdducts(xsaC, polarity = annParams$polarity, mzabs = annParams$mz.abs.add)
  
  # Edit CAMERA peak list to add sample names
  camera_output <- getPeaklist(xsaFA)
  sample_names <- xsaFA@xcmsSet@phenoData[[1]]
  sample_names_ed <- gsub(".mzML|.CDF|.mzXML", "", sample_names) 
  
  # Account for multiple groups
  length <- ncol(camera_output)
  end <- length-3
  camnames <- colnames(camera_output)
  groupNum <- nlevels(groupInfo)
  start <- groupNum+8
  camnames[start:end] <- sample_names_ed
  colnames(camera_output) <- camnames
  
  endGroup <- 7+groupNum
  camera_output <- camera_output[,-c(7:endGroup)]
  
  saveRDS(camera_output, "annotated_peaklist.rds")
  write.csv(camera_output, "annotated_peaklist.csv")
  
  print("Successfully performed peak annotation!")
  return(xsaFA)
}

#' Format Peak List
#' @description This function formats the CAMERA output to a usable format for MetaboAanlyst.
#' @param annotPeaks The object created using the PerformPeakAnnotation.
#' @param annParams The object created using the SetAnnotationParam function, 
#' containing user's specified or default parameters for downstream 
#' raw MS data pre-processing.
#' @param filtIso Logical, filter out all isotopes except for [M]+ for 
#' positive ion mode and [M]- for negative ion mode. By default it is 
#' set to true.
#' @param filtAdducts Logical, filter out all adducts except [M+H]+ for  
#' positive ion more and [M-H]- for negative ion mode. By default it is set to false.
#' @param missPercent Numeric, specify the threshold to remove features
#' missing in X\% of samples. For instance, 0.5 specifies to remove features
#' that are missing from 50\% of all samples per group. Method is only valid
#' when there are two groups.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export

FormatPeakList <- function(annotPeaks, annParams, filtIso = TRUE, filtAdducts = FALSE, missPercent = 0.5){
  
  camera_output <- readRDS("annotated_peaklist.rds")
  
  length <- ncol(camera_output)
  end <- length-3
  
  # Format peaklist for MetaboAnalyst
  camera_ma <- camera_output[, -length]
  
  if(filtAdducts==TRUE){
    if(annParams$polarity=="positive"){
      
      if(filtIso==TRUE){
        camera_isotopes <- camera_ma[grepl("\\[M\\]\\+", camera_ma$isotopes),]
      }else{
        camera_isotopes <- camera_ma[(camera_ma$isotopes != ""),]
      }
      camera_adducts <- camera_ma[grepl("\\[M\\+H\\]\\+", camera_ma$adduct),]
      camera_feats <- camera_ma[(camera_ma$isotopes == "" & camera_ma$adduct == ""),]
      unique_feats <- unique(rbind(camera_isotopes, camera_adducts, camera_feats))
      
    }else{ # negative polarity
      if(filtIso==TRUE){
        camera_isotopes <- camera_ma[grepl("\\[M\\]-", camera_ma$isotopes),]
      }else{
        camera_isotopes <- camera_ma[(camera_ma$isotopes != ""),]
      }
      camera_adducts <- camera_ma[grepl("\\[M-H\\]-", camera_ma$adduct),]
      camera_feats <- camera_ma[(camera_ma$isotopes == "" & camera_ma$adduct == ""),]
      unique_feats <- unique(rbind(camera_isotopes, camera_adducts, camera_feats))
    }
  }else{
    if(annParams$polarity=="positive"){
      
      if(filtIso==TRUE){
        camera_isotopes <- camera_ma[grepl("\\[M\\]\\+", camera_ma$isotopes),]
      }else{
        camera_isotopes <- camera_ma[(camera_ma$isotopes != ""),]
      }
      camera_adducts <- camera_ma[(camera_ma$adduct != ""),]
      camera_feats <- camera_ma[(camera_ma$isotopes == ""),]
      unique_feats <- unique(rbind(camera_isotopes, camera_adducts, camera_feats))
      
    }else{ # negative polarity
      
      if(filtIso==TRUE){
        camera_isotopes <- camera_ma[grepl("\\[M\\]-", camera_ma$isotopes),]
      }else{
        camera_isotopes <- camera_ma[(camera_ma$isotopes != ""),]
      }
      camera_adducts <- camera_ma[(camera_ma$adduct != ""),]
      camera_feats <- camera_ma[(camera_ma$isotopes == ""),]
      unique_feats <- unique(rbind(camera_isotopes, camera_adducts, camera_feats))
    }
  }
  
  # adjust decimal places, feats_info contains all samples
  feats_info <- unique_feats[,7:end]  
  feats_digits <- round(feats_info, 5) 
  
  group_info <- annotPeaks@xcmsSet@phenoData[[2]]
  combo_info <- rbind(as.character(group_info), feats_digits)
  
  mzs_rd <- round(unique_feats[,1], 5)
  mzs <- data.frame(c("Label", mzs_rd), stringsAsFactors = FALSE)
  
  # ensure features are unique
  mzs_unq <- mzs[duplicated(mzs),] 
  
  if(length(mzs_unq)>0){
    mzs[duplicated(mzs),] <- sapply(mzs_unq, function(x) paste0(x, sample(1:999, 1, replace = FALSE)))
  }
  
  colnames(mzs) <- "Sample" 
  ma_feats <- cbind(mzs, combo_info)
  
  # remove features missing in over X% of samples per group
  # only valid for 2 group comparisons!!
  ma_feats_miss <- ma_feats[which(rowMeans(is.na(ma_feats[,(ma_feats[1,]==as.character(unique(group_info[1])))])) 
                                  | rowMeans(is.na(ma_feats[,(ma_feats[1,]==as.character(unique(group_info[2])))])) < missPercent), ]
  
  write.csv(ma_feats_miss, "metaboanalyst_input.csv", row.names = FALSE)
  
  # provide index for CAMERA output
  Pklist_inx <- row.names(ma_feats_miss)
  ma_feats_miss_inx <- cbind(ma_feats_miss, Pklist_inx) 
  
  write.csv(ma_feats_miss_inx, "filtered_peaklist.csv", row.names = FALSE)
  
  return(ma_feats_miss)
}

###
### Functions for MS2 Data
###

#' Extract MS2 Data
#' @description This function returns a list of spectra that matches with 
#' a user specified precursor m/z. 
#' @param filename Name of the file (e.g. mzML, mzXML)
#' @param peakParams Object containing parameters for peak picking.
#' @param mzmin Minimum m/z when selecting a precursor from peak list 
#' @param mzmax Maximum m/z when selecting a precursor from peak list 
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import xcms

ExtractMS2data <- function(filename, peakParams, mzmin, mzmax){
  
  ## Function to extract MSMS spectra 
  xrms <- readMSData(filename, mode = "onDisk", centroided = TRUE)
  # get all MS2 spectra from DDA experiment
  ms2spectra <- spectra(filterMsLevel(xrms, msLevel = 2))  
  
  ## Detect chromatographic peaks 
  # set parameters and find chromatographic peaks 
  ms1cwp <- CentWaveParam(snthresh = peakParams$sn_thresh, 
                          noise = 100, ppm = peakParams$ppm, peakwidth = c(peakParams$min_pkw, peakParams$max_pkw))
  ms1data <- findChromPeaks(xrms, param = ms1cwp, msLevel = 1)
  
  #get all peaks 
  chromPeaks <- chromPeaks(ms1data)

  # pick one compound 
  ## find which row contains the mass we want
  chp <- as.data.frame(chromPeaks)
  rownum <- which(chp$mz >= mzmin & chp$mz <= mzmax)
  chromPeak <- chromPeaks[rownum,]
  
  #filter out fitting spectra
  filteredMs2spectra <- .getDdaMS2Scans(chromPeak, ms2spectra)
  
  return(filteredMs2spectra)
}

### Helper function
### function for DDA data 
### Credit: Michael Witting; https://github.com/michaelwitting/metabolomics2018/blob/master/R/msLevelMerge.R
### MS2 spectra need to be list of spectrum2 objects
.getDdaMS2Scans <- function(chromPeak, ms2spectra, mzTol = 0.01, rtTol = 20) {
  
  #make a new list
  filteredMs2spectra <- list()
  
  #isolate only the ones within range
  for(i in 1:length(ms2spectra)) {
    
    #isolate Ms2 spectrum
    ms2spectrum <- ms2spectra[[i]]
    
    #check if within range of peak
    if(abs(chromPeak[4] - ms2spectrum@rt) < rtTol & abs(chromPeak[1] - ms2spectrum@precursorMz) < mzTol) {
      filteredMs2spectra <- c(filteredMs2spectra, ms2spectrum)
    }
  }
  
  #return list with filtered ms2 spectra
  return(filteredMs2spectra)
}

#' Plot selected M2 spectra for an m/z feature
#' @description This function creates a plot of the user selected precursor m/z.
#' @param ms2 Spectrum2 class object containing all of the spectra for the selected
#' m/z feature.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import xcms

PlotMS2Spectra <- function(ms2, spectra = 1){
  
  MSnbase::plot(ms2[[spectra]])
  
}  
simscr/metaboanalyst documentation built on Jan. 21, 2020, 12:13 a.m.