R/Spectra_import.R


#'@title InitDataObjects
#'@description This function handles the construction of an mSet object for storing data for further processing and analysis.
#'It is necessary to use this function to specify to MetaboAnalystR the type of data and the type of analysis you will perform. 
#'@usage InitDataObjects(data.type, anal.type, paired=FALSE)
#'@param data.type The type of data, either list (Compound lists), conc (Compound concentration data), 
#'specbin (Binned spectra data), pktable (Peak intensity table), nmrpeak (NMR peak lists), mspeak (MS peak lists), 
#'or msspec (MS spectra data)
#'@param anal.type Indicate the analysis module to be performed: stat, pathora, pathqea, msetora, msetssp, msetqea, ts, 
#'cmpdmap, smpmap, or pathinteg
#'@param paired Indicate if the data is paired or not. Logical, default set to FALSE
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#' @return will initialize an mSet object
#' @export
#' @import methods
#' @import BiocParallel
#' @importFrom  Cairo CairoFonts
#' @examples 
#' mSet<-InitDataObjects("spec", "raw", FALSE)

InitDataObjects <- function(data.type, anal.type, paired=FALSE){
  
  if(anal.type == "raw" & data.type == "spec") {
    MessageOutput("OptiLCMS R objects initialized ...\n", SuppressWeb = TRUE);
    return(new("mSet"))
  }
  
}

#' @title Import raw MS data
#' @description This function handles the reading in of
#' raw MS data (.mzML, .CDF and .mzXML). Users must set
#' their working directory to the folder containing their raw
#' data, divided into subfolders named after the desired group labels. The
#' function will output two chromatograms into the user's working directory, a
#' base peak intensity chromatogram (BPIC) and a total ion
#' chromatogram (TIC). Further, this function sets the number of cores
#' to be used for parallel processing: it first determines the number of cores
#' available on the user's computer and then uses half of that number.
#' @param mSet mSet object, optional. Usually generated by InitDataObjects("spec", "raw", FALSE) before the data import.
#' @param path Character, the path to the folder containing the raw MS spectra
#' to be processed, or a character vector containing the absolute paths of all raw files.
#' @param metadata Data.frame or character, optional. A phenotype data frame or the absolute path of a metadata file (.txt) for all samples.
#' Its first column should be the sample names and its second column the corresponding group names. If omitted, all samples in the same sub-folder will be
#' considered as one group.
#' @param mode Character, the data input mode. Default is "onDisk" to avoid memory issues. "inMemory" will
#' read the data into memory.
#' @param plotSettings List, plotting parameters produced by the SetPlotParam function. "plot.opts" can be added through this
#' function to set the number of samples used for plotting. Default is "default"; "all" will use all samples for plotting and may cause
#' memory issues, especially for large datasets.
#' @param running.controller The controller for resuming the pipeline. Optional; does not need to be defined by hand.
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}, Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @export
#' @import MSnbase
#' @import BiocParallel
#' @import parallel
#' @import RColorBrewer
#' @importFrom tools file_path_as_absolute file_ext
#' @importFrom Cairo Cairo
#' @importFrom stats na.omit
#' @importFrom graphics legend
#' @importFrom grDevices dev.off
#' @return an mSet object with the raw data read in.
#' @examples 
#' ##' Get raw spectra files
#' DataFiles <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE,
#'                  recursive = TRUE)[c(10:12, 14:16)]
#' ##' Create a phenodata data.frame
#' pd <- data.frame(sample_name = sub(basename(DataFiles), pattern = ".mzData",
#'                                    replacement = "", fixed = TRUE),
#'                  sample_group = c(rep("col0", 3), rep("cyp79", 3)),
#'                  stringsAsFactors = FALSE)
#' ##' Import raw spectra
#' mSet <- ImportRawMSData(path = DataFiles, metadata = pd);


ImportRawMSData <-
  function(mSet = NULL,
           path = getwd(),
           metadata = NULL,
           mode = "onDisk",
           plotSettings = SetPlotParam(),
           running.controller = NULL) {
    
    # Initialize a new mSet if missing
    if(missing(mSet) & !.on.public.web()){
      message("No initialized mSet found, will initialize one automatically !")
      mSet <- new("mSet")
    } else if(is.null(mSet) & !.on.public.web()){
      message("mSet provided is not properly initialized, will re-initialize one automatically !")
      mSet <- new("mSet")
    } else if(.on.public.web() & file.exists("mSet.rda")) {
      load("mSet.rda")
    } else {
      message("Something weird happened, don't worry. We will re-initialize one automatically !")
      mSet <- new("mSet")
    }
    
    #Build Running plan for data import - Identify the controller
    if (is.null(running.controller)) {
      c1 <- TRUE;
      c2 <- TRUE;
      plan_switch <- FALSE;
    } else {
      plan_switch <- TRUE;
      c1 <-
        running.controller@data_import[["c1"]] # used to control data import
      c2 <-
        running.controller@data_import[["c2"]] # used to control plotting option
    }
    
    if(!exists(".SwapEnv")){
      .SwapEnv <<- new.env(parent = .GlobalEnv);
      .SwapEnv$.optimize_switch <- FALSE;
      .SwapEnv$count_current_sample <- 0;
      .SwapEnv$count_total_sample <- 120; # maximum number for on.public.web
      .SwapEnv$envir <- new.env();
    }
    
    start.time <- Sys.time()
    
    ## Deal with the different file path cases
    if(.on.public.web() && !dir.exists(path)){
      path <- "/home/glassfish/projects/MetaboDemoRawData/upload/"
    }
    
    files <- Path2Files(path = path);
    
    if (length(files) == 0) {
      MessageOutput(
        mes = paste0(
          "<font color=\"red\">",
          "\nERROR: No standard MS file found ! Please check the extension of your data.",
          "</font>"
        ),
        ecol = "\n",
        progress = NULL
      )
      stop()
      
    } else if (length(files) < 3) {
      MessageOutput(
        mes = paste0(
          "<font color=\"red\">",
          "\nERROR: At least 3 samples should be provided.",
          "</font>"
        ),
        ecol = "\n",
        progress = NULL
      )
      stop()
    }
    
    .SwapEnv$count_total_sample <- length(files);
    .SwapEnv$count_current_sample <- 0;
    toRemove = vector();
    
    # Update first
    if(length(mSet@rawfiles) == 0){
      rawfilenms <- basename(files);
      mSet@rawfiles <- files;
    } else {
      rawfilenms <- basename(mSet@rawfiles);
    }
    
    for (i in seq_along(files)) {
      file = basename(files[i])
      if (!(file %in% rawfilenms)) {
        toRemove = c(toRemove, files[i])
      }
    }
    
    toKeepInx = !(files %in% toRemove);
    files = files[toKeepInx];
    
    ####### -------------- Files ready ------------------#
    
    if(missing(metadata) | is.null(metadata)){
      warning("No metadata provided, all files in one sub-folder will be considered a group!")
    
      snames <- gsub("\\.[^.]*$", "", basename(files))
      # msg <- c(msg, paste("A total of ", length(files), "samples were found."))
      sclass <- gsub("^\\.$", "sample", dirname(files))
      
      scomp <- strsplit(substr(sclass, 1, min(nchar(sclass))), "", fixed = TRUE)
      scomp <- matrix(c(scomp, recursive = TRUE), ncol = length(scomp))
      
      i <- 1
      while (all(scomp[i, 1] == scomp[i, -1]) && i < nrow(scomp)) {
        i <- i + 1
      }
      
      i <-
        min(i, tail(c(0, which(
          scomp[seq_len(i), 1] == .Platform$file.sep
        )), n = 1) + 1)
      
      if (i > 1 && i <= nrow(scomp)) {
        sclass <- substr(sclass, i, max(nchar(sclass)))
      }
      
      if (.on.public.web() &
          unique(sclass)[1] == "upload" &
          length(unique(sclass)) == 1) {
        sclass <- rep("Unknown", length(sclass))
      }

    } else if(is.data.frame(metadata)) {
      metadata <- metadata;
    } else if(is.character(metadata)){
      metadata <- read.table(metadata)
    }
    
    filesNM <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(files));
    
    if (!is.null(metadata)){
      
      if(!all(basename(files) %in% metadata[,1])){
        if(!all(basename(filesNM) %in% metadata[,1])){
          warning("More files detected than metadata, will omit the extra files not in metadata!")
        }
      }
      
      snames <- metadata[metadata[,1] %in% c(basename(files), filesNM), 1]
      sclass <- metadata[metadata[,1] %in% c(basename(files), filesNM), 2]
      
    }

    if(length(snames) == 0 | length(sclass) == 0){
      stop("No correct sample/sclass found!")
    }
    
    # remove all MS2 files when doing MS1 data processing
    if(any(sclass=="MS2")){
      ms2_idx <- (sclass=="MS2");
      sclass <- sclass[!ms2_idx]
      snames <- snames[!ms2_idx]
      file_ms1_idx <- vapply(files, FUN = function(x){
        res <- vapply(snames, function(y){grepl(y, x)}, FUN.VALUE = logical(1L))
        any(res)
      }, FUN.VALUE = logical(1L))
      files <- files[file_ms1_idx]
    }
    
    pd <- data.frame(
      sample_name = snames,
      sample_group = sclass,
      stringsAsFactors = FALSE
    )
    
    if(.SwapEnv$.optimize_switch){
      MessageOutput(mes = "\nStep 2/6: Start to import the spectrum! \nThis step will take a short time...",
                    ecol = "\n",
                    progress = 21.0)
    } else {
      MessageOutput(mes = "\nStep 1/6: Start to import the spectrum! \nThis step will take a short time...",
                    ecol = "\n",
                    progress = 10.0)
    }

    
    if (c1) {
      raw_data <-
        tryCatch(read.MSdata(
          files = files,
          pdata = new("NAnnotatedDataFrame", pd),
          mode = mode,
          msLevel. = 1
        ), error = function(e) {e})
      
      if (!(is(raw_data,"OnDiskMSnExp") | is(raw_data,"MSnExp"))) {
        MessageOutput(
          mes = paste0(
            "<font color=\"red\">",
            "\nERROR:",
            raw_data$message,
            "</font>"
          ),
          ecol = "\n",
          progress = 1
        )
        stop("EXCEPTION POINT CODE: RMS2")
      }
      
      if(plan_switch){
        cache.save(raw_data, funpartnm = "data_import_c1");
        marker_record("data_import_c1");
      }
 
    } else {
      raw_data <- cache.read("data_import", "c1")
      marker_record("data_import_c1")
    }
    
    raw_data <- SanitySpectra(raw_data, cutoff = 0.05);
    
    MessageOutput(NULL, NULL, 22)
    
    if (c2) {
      if (plotSettings$Plot == TRUE) {
        if (is.null(plotSettings$plot.opts)) {
          plot.opts <- "default"
        } else {
          plot.opts <- plotSettings$plot.opts
        }
        
        if (plot.opts == "default") {
          #subset raw_data to at most 10 samples per group
          MessageOutput("To reduce memory usage, BPIS and TICS plots will be created using only 10 samples per group.", SuppressWeb = TRUE)
          
          grp_nms <- names(table(pd$sample_group))
          files <- NA
          
          for (i in seq_along(grp_nms)) {
            numb2ext <- min(table(pd$sample_group)[i], 10)
            filt_df <- pd[pd$sample_group == grp_nms[i], ]
            files.inx <- sample(nrow(filt_df), numb2ext)
            sel.samples <- filt_df$sample_name[files.inx]
            files <-
              c(files, which(pd$sample_name %in% sel.samples))
          }
          
          raw_data_filt <-
            filterFile(raw_data, file = na.omit(files))
          
        } else{
          raw_data_filt <- raw_data
          # just for plotting
        }
        
        if (plot.opts == "all") {
          h <-
            readline(prompt = "Using all samples to create BPIS and TICS plots may cause severe memory issues! Press [0] to continue, or [1] to cancel: ")
          h <- as.integer(h)
          
          if (h == 1) {
            MessageOutput("ImportRawMSData function aborted!\n")
            return(0)
          }
        }
        
        MessageOutput("Plotting BPIS and TICS.")
        # Plotting functions to see entire chromatogram
        bpis <- chromatogram(raw_data_filt, aggregationFun = "max", BPPARAM = MulticoreParam(4))
        tics <- chromatogram(raw_data_filt, aggregationFun = "sum", BPPARAM = MulticoreParam(4))
        groupInfo <-as.factor(pd$sample_group);
        groupNum <- nlevels(groupInfo)
        
        if (groupNum > 9) {
          col.fun <-
            grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
          group_colors <- col.fun(groupNum)
          
        } else{
          group_colors <-
            paste0(RColorBrewer::brewer.pal(9, "Set1")[seq_len(groupNum)], "60")
        }
        
        names(group_colors) <- levels(groupInfo)
        
        bpis_name <-
          paste("BPIS_",
                plotSettings$dpi,
                ".",
                plotSettings$format,
                sep = "")
        
        tics_name <-
          paste("TICS_",
                plotSettings$dpi,
                ".",
                plotSettings$format,
                sep = "")
        
        if(.on.public.web()){
          save(raw_data_filt, file = "raw_data_filt.rda");
          save(tics, file = "tics.rda");
          save(bpis, file = "bpis.rda");
        }
        
        if (.on.public.web()) {
          Cairo::Cairo(
            file = bpis_name,
            unit = "in",
            dpi = plotSettings$dpi,
            width = 9,
            height = 6.75,
            type = plotSettings$format,
            bg = "white"
          )
        }
        
        plot(bpis, col = group_colors[raw_data_filt$sample_group])
        legend(
          "topright",
          legend = levels(groupInfo),
          pch = 15,
          col = group_colors
        )
        
        if (.on.public.web()) {
          dev.off()
          
          Cairo::Cairo(
            file = tics_name,
            unit = "in",
            dpi = plotSettings$dpi,
            width = 9,
            height = 6.75,
            type = plotSettings$format,
            bg = "white"
          )
        }
        
        plot(tics, col = group_colors[raw_data_filt$sample_group])
        legend(
          "topright",
          legend = levels(groupInfo),
          pch = 15,
          col = group_colors
        )
        
        if (.on.public.web()) {
          dev.off()
        }
        
      }
      if (plan_switch) {
        marker_record("data_import_c2")
      }
    }
    
    if(.SwapEnv$.optimize_switch){
      MessageOutput(
        mes = paste0(
          "Step 2/6: Successfully imported raw MS data! (",
          Sys.time(),
          ") \nGoing to the next step..."
        ),
        ecol = "\n",
        progress = 24
      )
    } else {
      MessageOutput(
        mes = paste0(
          "Step 1/6: Successfully imported raw MS data! (",
          Sys.time(),
          ") \nGoing to the next step..."
        ),
        ecol = "\n",
        progress = 18
      )
    }
    
    if(mode == "onDisk"){
      mSet@rawOnDisk <- raw_data;
    } else if(mode == "inMemory"){
      mSet@rawInMemory <- raw_data;
    }
    
    .SwapEnv$.optimize_switch <- FALSE;
    
    if(.on.public.web()){
      save(mSet, file = "mSet.rda");
    }
    
    return(mSet)
}

read.MSdata <- function(files, 
                        pdata = NULL, 
                        msLevel. = NULL, 
                        centroided. = NA,
                        smoothed. = NA, 
                        cache. = 1L,
                        mode = c("inMemory", "onDisk")) {
  
  mode <- match.arg(mode)
  ## o normalize the file path, i.e. replace relative path with absolute
  ##   path. That fixes possible problems on Windows with SNOW parallel
  ##   processing and also proteowizard problems on unix systems with ~ paths.
  files <- normalizePath(files)
  suppressWarnings(.hasChroms <- MSnbase::hasChromatograms(files))
  
  MessageOutput ("Raw file import begin...", "\n", NULL)
  
  if (!length(files)) {
    process <- new("MSnProcess",
                   processing = paste("No data loaded:", date()))
    if (mode == "inMemory")
      res <- new("MSnExp",
                 processingData = process)
    else res <- new("OnDiskMSnExp",
                    processingData = process)
  } else {
    if (mode == "inMemory") {
      if (is.null(msLevel.)) msLevel. <- 2L
      res <- read.InMemMSd.data(files, pdata = pdata, msLevel. = msLevel.,
                                centroided. = centroided., smoothed. = smoothed., cache. = cache.)
    } else { ## onDisk
      res <- read.OnDiskMS.data(files = files, pdata = pdata,
                                msLevel. = msLevel., centroided. = centroided., smoothed. = smoothed.)
    }
  }
  res
}
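
## A minimal usage sketch (assuming the mtbls2 example data is installed).
## read.MSdata() is normally called internally by ImportRawMSData(), but it can
## also be used directly to obtain an OnDiskMSnExp object:
# files <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE,
#              recursive = TRUE)[c(10:12)]
# raw <- read.MSdata(files, pdata = NULL, mode = "onDisk", msLevel. = 1)
# class(raw)   # "OnDiskMSnExp"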

#' @import utils
#' @importFrom progress progress_bar
read.InMemMSd.data <- function(files, 
                               pdata, 
                               msLevel., 
                               centroided., 
                               smoothed., 
                               cache. = 1) {
  .testReadMSDataInput(environment())
  if (isCdfFile(files)) {
    if(missing(msLevel.)){
      msLevel. <- 1;
    }
    isCDF <- TRUE;
  } else {
    isCDF <- FALSE;
  }
  
  if (msLevel. == 1) cache. <- 0
  msLevel. <- as.integer(msLevel.)
  ## Creating environment with Spectra objects
  assaydata <- new.env(parent = emptyenv())
  ioncount <- c()
  ioncounter <- 1
  filenams <- filenums <- c()
  fullhd2 <- fullhdorder <- c()
  fullhdordercounter <- 1
  .instrumentInfo <- list()
  ## List eventual limitations
  
  ## ## Idea:
  ## ## o initialize a featureData-data.frame,
  ## ## o for each file, extract header info and put that into
  ##      featureData;
  
  count.idx <- 0;
  
  for (f in files) {
    
    filen <- match(f, files)
    filenums <- c(filenums, filen)
    filenams <- c(filenams, f)
    ## issue #214: define backend based on file format.
    msdata <- mzR::openMSfile(f,backend = NULL)
    
    ## Extract Instrument Information
    instruInfo <- try(mzR::instrumentInfo(msdata), silent = TRUE);
    if(is(instruInfo,"list")){
      .instrumentInfo <- c(.instrumentInfo, list(instruInfo));
    } else {
      MessageOutput("Some instrument related information is missing. But the whole process is still running...")
      .instrumentInfo <- c(.instrumentInfo, list());
    }
    
    fullhd <- mzR::header(msdata)
    ## Issue #325: get centroided information from file, but overwrite if
    ## specified with centroided. parameter.
    if (!is.na(centroided.))
      fullhd$centroided <- as.logical(centroided.)
    spidx <- which(fullhd$msLevel == msLevel.)
    ## increase vectors as needed
    ioncount <- c(ioncount, numeric(length(spidx)))
    fullhdorder <- c(fullhdorder, numeric(length(spidx)))
    if (msLevel. == 1) {
      if (length(spidx) == 0)
        stop("No MS1 spectra in file: ",basename(f))
      
      if (.on.public.web()){   
        print_mes <- paste0("Importing ",basename(f),":");    
        write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = " ");
      } else {
        pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta", 
                               total = length(spidx), clear = TRUE, width= 75)
      }
      
      k_count <- 0;
      if(isCDF){

        Allpeaks <- mzR::peaks(msdata);

        for (i in seq_along(spidx)){
          
          if (!.on.public.web()){   
            pb$tick();
          } else {  
            if (round(i/length(spidx),digits = 4)*100 - k_count > -0.2){
              print_mes <- paste0(k_count,"% | ");    
              write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = " ");
              k_count <- k_count +20;
            }
          }
          
          j <- spidx[i]
          hd <- fullhd[j, ]
          ## Fix missing polarity from netCDF
          pol <- hd$polarity
          if (length(pol) == 0)
            pol <- NA
          sp <- new("Spectrum1",
                    rt = hd$retentionTime,
                    acquisitionNum = as.integer(hd$acquisitionNum),
                    scanIndex = as.integer(hd$seqNum),
                    tic = hd$totIonCurrent,
                    mz = Allpeaks[[j]][, 1],
                    intensity = Allpeaks[[j]][, 2],
                    fromFile = as.integer(filen),
                    centroided = as.logical(hd$centroided),
                    smoothed = as.logical(smoothed.),
                    polarity = as.integer(pol))
          ## peaksCount
          ioncount[ioncounter] <- sum(Allpeaks[[j]][, 2])
          ioncounter <- ioncounter + 1
          .fname <- formatFileSpectrumNames(
            fileIds = filen,
            spectrumIds = i,
            nSpectra = length(spidx),
            nFiles = length(files)
          )
          assign(.fname, sp, assaydata)
          fullhdorder[fullhdordercounter] <- .fname
          fullhdordercounter <- fullhdordercounter + 1
        }

        rm(Allpeaks);
        
      } else {
     
      for (i in seq_along(spidx)) {
        
        if (!.on.public.web()){   
          pb$tick();
        } else {  
          if (round(i/length(spidx),digits = 4)*100 - k_count > -0.2){
            print_mes <- paste0(k_count,"% | ");    
            write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = " ");
            k_count <- k_count +20;
          }
        }
        
        j <- spidx[i]
        hd <- fullhd[j, ]
        ## Fix missing polarity from netCDF
        pol <- hd$polarity
        if (length(pol) == 0)
          pol <- NA
        .p <- mzR::peaks(msdata, j)
        sp <- new("Spectrum1",
                  rt = hd$retentionTime,
                  acquisitionNum = as.integer(hd$acquisitionNum),
                  scanIndex = as.integer(hd$seqNum),
                  tic = hd$totIonCurrent,
                  mz = .p[, 1],
                  intensity = .p[, 2],
                  fromFile = as.integer(filen),
                  centroided = as.logical(hd$centroided),
                  smoothed = as.logical(smoothed.),
                  polarity = as.integer(pol))
        ## peaksCount
        ioncount[ioncounter] <- sum(.p[, 2])
        ioncounter <- ioncounter + 1
        .fname <- formatFileSpectrumNames(
          fileIds = filen,
          spectrumIds = i,
          nSpectra = length(spidx),
          nFiles = length(files)
        )
        assign(.fname, sp, assaydata)
        fullhdorder[fullhdordercounter] <- .fname
        fullhdordercounter <- fullhdordercounter + 1
      }
      
      }
      
    } else { ## .msLevel != 1
      if (length(spidx) == 0)
        stop("No MS(n>1) spectra in file: ", basename(f))
      MessageOutput(paste("Reading ", length(spidx), " MS", msLevel.,
                          " spectra from file ", basename(f),"\n"))
      
      scanNums <- fullhd[fullhd$msLevel == msLevel., "precursorScanNum"]
      if (length(scanNums) != length(spidx))
        stop("Number of spectra and precursor scan number do not match!")
      
      pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta", 
                             total = length(spidx), clear = TRUE, width= 75)
      
      for (i in seq_along(spidx)) {
        
        pb$tick();
        
        j <- spidx[i]
        hd <- fullhd[j, ]
        .p <- mzR::peaks(msdata, j)
        sp <- new("Spectrum2",
                  msLevel = as.integer(hd$msLevel),
                  merged = as.numeric(hd$mergedScan),
                  precScanNum = as.integer(scanNums[i]),
                  precursorMz = hd$precursorMZ,
                  precursorIntensity = hd$precursorIntensity,
                  precursorCharge = as.integer(hd$precursorCharge),
                  collisionEnergy = hd$collisionEnergy,
                  rt = hd$retentionTime,
                  acquisitionNum = as.integer(hd$acquisitionNum),
                  scanIndex = as.integer(hd$seqNum),
                  tic = hd$totIonCurrent,
                  mz = .p[, 1],
                  intensity = .p[, 2],
                  fromFile = as.integer(filen),
                  centroided = as.logical(hd$centroided),
                  smoothed = as.logical(smoothed.),
                  polarity = as.integer(hd$polarity))
        ## peaksCount
        ioncount[ioncounter] <- sum(.p[, 2])
        ioncounter <- ioncounter + 1
        .fname <- formatFileSpectrumNames(
          fileIds = filen,
          spectrumIds = i,
          nSpectra = length(spidx),
          nFiles = length(files)
        )
        assign(.fname, sp, assaydata)
        fullhdorder[fullhdordercounter] <- .fname
        fullhdordercounter <- fullhdordercounter + 1
      }
    }
    
    if (cache. >= 1)
      fullhd2 <- rbind(fullhd2, fullhd[spidx, ])
    
    gc()
    mzR::close(msdata)
    rm(msdata);
    
    if (.on.public.web()){ 
      print_mes <- paste0("Done!");    
      write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = "\n");
      
      count.idx <- count.idx + 1;  
      write.table(1.0 + count.idx/length(files)*3, file = "log_progress.txt", row.names = FALSE,col.names = FALSE);
    }
    MessageOutput(paste0("Import data: ",basename(f)," finished!"), ecol = "\n")
  }
  
  ## cache level 2 not yet implemented
  cache. <- testCacheArg(cache., maxCache = 2)
  if (cache. >= 1) {
    fl <- sapply(assaydata, function(x) x@fromFile)
    featnms <- ls(assaydata) ## feature names in final MSnExp
    fl <- fl[featnms] ## reorder file numbers
    stopifnot(all(base::sort(featnms) == base::sort(fullhdorder)))
    fullhdorder <- match(featnms, fullhdorder)
    tmphd <- fullhd2[fullhdorder, ] ## reorder
    ioncount <- ioncount[fullhdorder]
    newhd <- data.frame(fileIdx = fl,
                        retention.time = tmphd$retentionTime,
                        precursor.mz = tmphd$precursorMZ,
                        precursor.intensity = tmphd$precursorIntensity,
                        charge = tmphd$precursorCharge,
                        peaks.count = tmphd$peaksCount,
                        tic = tmphd$totIonCurrent,
                        ionCount = ioncount,
                        ms.level = tmphd$msLevel,
                        acquisition.number = tmphd$acquisitionNum,
                        collision.energy = tmphd$collisionEnergy)
  } else {
    newhd <- NULL ## not used anyway
  }
  .cacheEnv <- setCacheEnv(list("assaydata" = assaydata,
                                "hd" = newhd),
                           cache., lock = TRUE)
  ## CACHING HAS BEEN SUPERSEDED BY THE OnDiskMSnExp IMPLEMENTATION
  ## if cache==2, do not lock assign msdata in .cacheEnv then lock
  ## it and do not close(msdata) above; rm(msdata) is OK
  
  ## Create 'MSnProcess' object
  process <- new("MSnProcess",
                 processing = paste("Data loaded:", date()),
                 files = files,
                 smoothed = smoothed.)
  ## Create 'fdata' and 'pdata' objects
  nms <- ls(assaydata)
  if (is.null(pdata)) {
    .pd <- data.frame(sampleNames = basename(files))
    rownames(.pd) <- .pd$sampleNames
    pdata <- new("AnnotatedDataFrame",
                 data = .pd)
  }
  fdata <- new("AnnotatedDataFrame",
               data = data.frame(
                 spectrum = seq_along(nms),
                 row.names = nms))
  fdata <- fdata[ls(assaydata)] ## reorder features
  ## experiment data slot
  if (length(.instrumentInfo) > 1) {
    cmp <- length(unique(sapply(.instrumentInfo, "[[", 1)))
    if (cmp > 1)
      message("According to the instrument information in the files,\n",
              "the data has been acquired on different instruments!")
    for (nm in names(.instrumentInfo[[1]]))
      .instrumentInfo[[1]][[nm]] <- sapply(.instrumentInfo, "[[", nm)
  } else if (length(.instrumentInfo) == 0){
    .instrumentInfo[[1]] <- list(NULL);
    .instrumentInfo[[1]]$manufacturer <- .instrumentInfo[[1]]$model <- 
      .instrumentInfo[[1]]$ionisation <- .instrumentInfo[[1]]$analyzer <- .instrumentInfo[[1]]$detector <- 
      "Unknown";
  }
  expdata <- new("MIAPE",
                 instrumentManufacturer = .instrumentInfo[[1]]$manufacturer,
                 instrumentModel = .instrumentInfo[[1]]$model,
                 ionSource = .instrumentInfo[[1]]$ionisation,
                 analyser = as.character(.instrumentInfo[[1]]$analyzer),
                 detectorType = .instrumentInfo[[1]]$detector)
  ## Create and return 'MSnExp' object
  
  toReturn <- new("MSnExp",
                  assayData = assaydata,
                  phenoData = pdata,
                  featureData = fdata,
                  processingData = process,
                  experimentData = expdata,
                  .cache = .cacheEnv)
  return(toReturn)
}

read.OnDiskMS.data <- function(files, 
                               pdata, 
                               msLevel., 
                               centroided., 
                               smoothed.) {
  
  .testReadMSDataInput(environment())
  stopifnot(is.logical(centroided.))
  
  ## Creating environment with Spectra objects
  assaydata <- new.env(parent = emptyenv())
  filenams <- filenums <- c()
  fullhd2 <- fullhdorder <- c()
  fullhdordercounter <- 1
  .instrumentInfo <- list()
  ## List eventual limitations
  if (isCdfFile(files)) {
    message("You are using CDF files, please make sure they are centroided !")
  }
  ## Idea:
  ## o initialize a featureData-data.frame,
  featureDataList <- list()
  ## o for each file, extract header info and put that into featureData
  ##pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta", 
  #                       total = length(spidx), clear = TRUE, width= 75);
  
  count_mark <- 0;
  
  for (f in files) {
    
    if (.on.public.web()){
      
      print_mes <- paste(basename(f),"import done!");    
      write.table(print_mes,file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = "\n");
      
    } else {
      #pb$tick();
    }    
    
    filen <- match(f, files)
    filenums <- c(filenums, filen)
    filenams <- c(filenams, f)
    ## issue #214: define backend based on file format.
    msdata <- mzR::openMSfile(f,backend = NULL)
    
    ## Extract Instrument Information
    instruInfo <- try(mzR::instrumentInfo(msdata), silent = TRUE);
    if(is(instruInfo,"list")){
      .instrumentInfo <- c(.instrumentInfo, list(instruInfo));
    } else {
      MessageOutput("Some instrument related information is missing. But the whole process is still running...")
      .instrumentInfo <- c(.instrumentInfo, list());
    }
    
    fullhd <- mzR::header(msdata)
    spidx <- seq_len(nrow(fullhd))
    
    ## Don't read the individual spectra, just define the names of
    ## the spectra.
    fullhdorder <- c(
      fullhdorder,
      formatFileSpectrumNames(
        fileIds = filen,
        spectrumIds = seq_along(spidx),
        nSpectra = length(spidx),
        nFiles = length(files)
      )
    )
    ## Extract all Spectrum info from the header and put it into the featureData
    fdData <- fullhd[spidx, , drop = FALSE]
    ## rename totIonCurrent and peaksCount, as detailed in
    ## https://github.com/lgatto/MSnbase/issues/105#issuecomment-229503816
    names(fdData) <- sub("peaksCount", "originalPeaksCount", names(fdData))
    ## Add also:
    ## o fileIdx -> links to fileNames property
    ## o spIdx -> the index of the spectrum in the file.
    fdData <- cbind(fileIdx = rep(filen, nrow(fdData)),
                    spIdx = spidx,
                    smoothed = rep(as.logical(smoothed.), nrow(fdData)),
                    fdData, stringsAsFactors = FALSE)
    if (isCdfFile(f)) {
      ## Add the polarity columns if missing in netCDF
      if (!any(colnames(fdData) == "polarity"))
        fdData <- cbind(fdData, polarity = rep(as.integer(NA),
                                               nrow(fdData)))
    }
    ## Order the fdData by acquisitionNum to force use of acquisitionNum
    
    ## as unique ID for the spectrum (issue #103). That way we can use
    ## the spIdx (is the index of the spectrum within the file) for
    ## subsetting and extracting.
    if (!all(sort(fdData$acquisitionNum) == fdData$acquisitionNum))
      warning(c("Unexpected acquisition number order detected. ",
                    "\n")) ## see issue #160
    fdData <- fdData[order(fdData$acquisitionNum), ]
    featureDataList <- c(featureDataList, list(fdData))
    ## Fix for #151; would be nice if we could remove that at some point.
    gc()
    mzR::close(msdata)
    rm(msdata)
  }
  ## new in version 1.9.8
  lockEnvironment(assaydata, bindings = TRUE)
  .cacheEnv <- setCacheEnv(list("assaydata" = assaydata,
                                "hd" = NULL),
                           level = 0,
                           lock = TRUE)
  
  ## Create 'MSnProcess' object
  process <- new("MSnProcess",
                 processing = paste0("Data loaded [", date(), "]"),
                 files = files,
                 smoothed = NA)
  ## Create 'fdata' and 'pdata' objects
  if (is.null(pdata)) {
    .pd <- data.frame(sampleNames = basename(files))
    rownames(.pd) <- .pd$sampleNames
    pdata <- new("AnnotatedDataFrame",
                 data = .pd)
  }
  ## If we've got a featureDataList, use it
  if (length(featureDataList) > 0) {
    fdata <- do.call(rbind, featureDataList)
    fdata <- cbind(fdata, spectrum = seq_len(nrow(fdata)),
                   stringsAsFactors = FALSE)
    ## Setting rownames on the data.frame not on the AnnotatedDataFrame;
    ## did get strange errors otherwise.
    rownames(fdata) <- fullhdorder
    ## Re-order them
    fdata <- fdata[base::sort(fullhdorder), ]
    fdata <- new("AnnotatedDataFrame", data = fdata)
    ## Re-order the features.
    ## fdata <- fdata[ls(assaydata), ]
  } else fdata <- new("AnnotatedDataFrame")
  
  ## experiment data slot
  if (length(.instrumentInfo) > 1) {
    cmp <- length(unique(sapply(.instrumentInfo, "[[", 1)))
    if (cmp > 1)
      message("According to the instrument information in the files,\n",
              "the data has been acquired on different instruments!")
    for (nm in names(.instrumentInfo[[1]]))
      .instrumentInfo[[1]][[nm]] <- sapply(.instrumentInfo, "[[", nm)
  } else if (length(.instrumentInfo) == 0){
    .instrumentInfo[[1]] <- list(NULL);
    .instrumentInfo[[1]]$manufacturer <- .instrumentInfo[[1]]$model <- 
      .instrumentInfo[[1]]$ionisation <- .instrumentInfo[[1]]$analyzer <- .instrumentInfo[[1]]$detector <- 
      "Unknown";
  }
  expdata <- new("MIAPE",
                 instrumentManufacturer = .instrumentInfo[[1]]$manufacturer,
                 instrumentModel = .instrumentInfo[[1]]$model,
                 ionSource = .instrumentInfo[[1]]$ionisation,
                 analyser = as.character(.instrumentInfo[[1]]$analyzer),
                 detectorType = .instrumentInfo[[1]]$detector)
  ## Create ProcessingStep if needed.
  ## Create the OnDiskMSnExp object.
  res <- new("OnDiskMSnExp",
             assayData = assaydata,
             phenoData = pdata,
             featureData = fdata,
             processingData = process,
             experimentData = expdata,
             .cache  =  .cacheEnv)
  if (!is.null(msLevel.)) {
    msLevel. <- as.integer(msLevel.)
    res <- filterMsLevel(res, msLevel.)
  }
  if (any(!is.na(centroided.))) {
    if (length(centroided.) == 1) {
      centroided(res) <- centroided.
    } else {
      for (i in seq_along(centroided.))
        centroided(res, msLevel. = i) <- centroided.[i]
    }
  }
  
  if(nrow(res@featureData@data)==0){
    stop(c("None of your spectra contains MS signal at MS level: ", msLevel., "! Please check your data !"))
  }
  
  if (.on.public.web()){
    write.table("Raw file initialized Successfully!",file="metaboanalyst_spec_proc.txt",append = TRUE,row.names = FALSE,col.names = FALSE, quote = FALSE, eol = "\n");
  }
  
  return(res)
}

#' @title UpdateRawfiles
#' @description Update the raw spectra included for processing.
#' All files with a wrong format and all uncentroided files will be filtered out.
#' NOTE: this function is only effective before the data import stage
#' AND can NOT be used to resume the pipeline.
#' @param mSet mSet object generated
#' with \"mSet <- InitDataObjects(\"spec\", \"raw\", FALSE)\", or omitted.
#' @param filesIncluded A character vector containing the paths of the files
#' to be included in the following processing steps.
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} and 
#' Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @return will return an mSet object with raw files updated
#' @export
#' @examples 
#' ### Example 1 ---
#' data(mSet)
#' newfiles <- dir(system.file("mzData", package = "mtbls2"), 
#' full.names = TRUE, recursive = TRUE)[c(14:16)]
#' mSet <- UpdateRawfiles(mSet, filesIncluded = newfiles)
#'  
#' ### Example 2 ---
#' ## load googledrive package to download example data
#' # library("googledrive");
#' # data_folder_Sample <- "Raw_data_example"
#' # temp <- tempfile(fileext = ".zip");
#' ## Please authorize the package to download the data from web
#' # dl <- drive_download(as_id("1CjEPed1WZrwd5T3Ovuic1KVF-Uz13NjO"), path = temp, overwrite = TRUE);
#' # out <- unzip(temp, exdir = data_folder_Sample);
#' # out;
#' # mSet<-InitDataObjects("spec", "raw", FALSE);
#' ## include only two samples CD_SM-77FXR.mzML and CD_SM-6KUCT.mzML for data import.
#' # mSet<-UpdateRawfiles(mSet, c("Raw_data_example/CD/CD_SM-77FXR.mzML", 
#' #                      "Raw_data_example/CD/CD_SM-6KUCT.mzML"))

UpdateRawfiles <- function(mSet = NULL, filesIncluded = NULL){
  
  # TODO: to develop a shiny interface for users to select the files to include
  # if (interactive()) {
  #   
  #   options(shiny.maxRequestSize=400*1024^2) 
  #   
  #   ui <- fluidPage(
  #     titlePanel("Multiple Spectral file read"),
  #     sidebarLayout(
  #       sidebarPanel(
  #         fileInput("SpectralFiles", "Choose Spectra File", accept = c(".mzML",".mzXML","mzml","mzxml","mzData"),
  #                   multiple = TRUE),
  #         
  #       ),
  #       mainPanel(
  #         textOutput("count")
  #       )
  #     )
  #   )
  # 
  #   server <- function(input, output) {
  # 
  #     output$contents <- renderTable({
  #       file <- input$SpectralFiles
  #       ext <- tools::file_ext(file$datapath)
  # 
  #       req(file)
  #       validate(need(ext %in% c(".mzML",".mzXML","mzml","mzxml","mzData"), "Please upload a spectral file !"))
  # 
  #       file;
  #     })
  #     
  #     return(output)
  #   }
  #   
  #   shinyApp(ui, server)
  # }
  if(is.null(mSet)){
    mSet <- new("mSet");
  }
  
  if(.on.public.web()){
    
    if(!dir.exists("upload/")){
      filesList <- list.files("/home/glassfish/projects/MetaboDemoRawData/upload/", recursive = TRUE, full.names = TRUE);
    } else {
      filesList <- list.files("upload/", recursive = TRUE, full.names = TRUE);
    }
    
    filesListBaseNMs <- basename(filesList);
    filesIncluded <- filesList[sapply(filesIncluded, function(x){which(x == filesListBaseNMs)})];
  }
  
  if(!is.null(filesIncluded)){
    
    # file exists check
    fileIdx <- file.exists(filesIncluded);
    if(!any(fileIdx)){
      stop("No valid files provided ! Please check your files or their path !")
    }
    filesIncluded_exited <- filesIncluded[fileIdx];
    filesIncluded_full <- unname(sapply(filesIncluded_exited, tools::file_path_as_absolute));
    
    # file format check
    exts <- tools::file_ext(filesIncluded_full);
    extsIdx <- exts %in% c("mzML","mzXML","mzml","mzxml","mzData", "mzdata", "CDF", "cdf");
    if(!any(extsIdx)){
      stop("No valid format files provided ! Only files with extension of \"mzML\", \"mzml\", \"mzXML\", \"mzxml\", \"mzData\" and \"mzdata\" are supported!")
    }
    filesIncluded_formated <- filesIncluded_full[extsIdx];
    
    # file centroid check
    Centroididx <- unname(sapply(filesIncluded_formated, CentroidCheck));
    if(!any(Centroididx)){
      stop("No centroided spectrum found ! Please Centroid them first !")
    }
    filesIncluded_centroided <- filesIncluded_formated[Centroididx];
    message(c(basename(filesIncluded_centroided), " will be included for further processing !\n"))
    
    # file size check
    fileSizeInfo <- file.size(filesIncluded_centroided)/1024^2;
    largeFileIdx <- fileSizeInfo > 200;
    if(any(largeFileIdx)){
      message(c(basename(filesIncluded_centroided)[largeFileIdx]), " is larger than 200MB, please watch your memory usage !")
    }
    
    filesIncluded <- filesIncluded_centroided;
    
  } else {
    warning("No files will be included for mSet !")
  }
  
  
  if(is.null(filesIncluded)){
    message("No files will be used to update the files inclusion for mSet!")
  }
  
  mSet@rawfiles <- filesIncluded;
  
  if(.on.public.web()){
    save(mSet, file = "mSet.rda");
  }
  
  return(mSet);
}

#' @title CentroidCheck
#' @description Verify whether the MS data file is centroided or not
#' @param filename single file name, should contain the absolute path
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @importFrom stats quantile
#' @export
#' @return will output a logical value indicating whether the data is centroided (TRUE) or not (FALSE)
#' @examples
#' DataFiles <- dir(system.file("mzData", package = "mtbls2"), full.names = TRUE,
#'                  recursive = TRUE)[c(10:11)] 
#' # sapply(DataFiles, CentroidCheck)
#' 

CentroidCheck <- function(filename) {
  # fileh <- MSnbase:::.openMSfile(filename)
  fileh <- mzR::openMSfile(filename, backend = NULL)
  allSpect <- mzR::peaks(fileh, seq_len(50))
  
  nValues <- base::lengths(allSpect, use.names = FALSE) / 2

  mzR::close(fileh)
  rm(fileh)
  
  res <- lapply(
    allSpect,
    FUN = function(z, APPLF, ...) {
      pk <- as.data.frame(z)
      
      k = 0.025
      qtl = 0.9
      .qtl <- quantile(pk[, 2], qtl)
      x <- pk[pk[, 2] > .qtl, 1]
      quantile(diff(x), 0.25) > k
      
    }
  )
  
  res <- unlist(res[!is.na(res)]);
  return(sum(res) > 0.6*length(res))
  
}

#' @title CentroidMSData
#' @description Convert MS data to centroid mode
#' @param InFolder single file/folder name
#' @param OutFolder output folder name; if it does not exist, it will be created
#' @param ncore the core number for parallel processing, default is 1
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca} and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
#' @import MSnbase
#' @import BiocParallel
#' @export
#' @return will write the centroided mzML file(s) into the output folder
#' @examples
#' InFolder <- system.file("mzData", package = "mtbls2")
#' # CentroidMSData(InFolder) # remove the # before your testing


CentroidMSData <- function(InFolder, OutFolder = tempdir(), ncore = 1) {

  if (.Platform$OS.type == "unix") {
    register(bpstart(MulticoreParam(ceiling(ncore))))
  } else if (.Platform$OS.type == "windows") {
    register(bpstart(SnowParam(ceiling(ncore))))
  }
  
  #TODO: to convert the vendor formats with wine
  files.list <-
    dir(
      InFolder,
      pattern = ".mzML|.mzml|.cdf|.mzXML|.mzxml|.mzData|.CDF|.mzdata.xml|.mzData.xml",
      recursive = TRUE,
      full.names = TRUE
    );
  
  if(length(files.list) == 0){
    stop("No MS spectra found, please input a folder containing mzML, mzXML, mzData or CDF file(s) !")
  }
    
  OutPath <- normalizePath(OutFolder);
  
  if(!dir.exists(OutPath)){
    dir.create(OutPath);
    OutPath <- normalizePath(OutFolder);
  }
  
  if(ncore == 1){
    
    res <- sapply(files.list, function(x){
      
      if(!CentroidCheck(x)){
        CMS <- .CentroidSpectra(x);
        fileNM <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(x));
        files.mzml <- paste0(OutPath,"/", fileNM, ".mzML");
        writeMSData(CMS, file = files.mzml);
      }})
    
  } else {
    
    res <- bplapply(files.list, function(x){
      
      if(!CentroidCheck(x)){
        CMS <- .CentroidSpectra(x);
        fileNM <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(x));
        files.mzml <- paste0(OutPath,"/", fileNM, ".mzML");
        writeMSData(CMS, file = files.mzml);
      }
      
    },
    BPPARAM = bpparam()) # use the parallel backend registered above (MulticoreParam on unix, SnowParam on Windows)
    
  }
  
  register(bpstop());
  
}

.CentroidSpectra <- function(InputFile){
  
  MS <- read.InMemMSd.data(InputFile, pdata = NULL, msLevel. = 1, centroided. = FALSE, smoothed. = FALSE);
  return(MSnbase::pickPeaks(MS))
  
}
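
## A sketch of the helper used by CentroidMSData(): read a single profile-mode
## file into memory and centroid it by peak picking (the file path below is
## hypothetical):
# cms <- .CentroidSpectra("path/to/profile_file.mzML")
# MSnbase::writeMSData(cms, file = "centroided.mzML")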



Path2Files <- function(path, centroid = TRUE){
  Pathname <- normalizePath(path);
  
  multipleMS <- length(Pathname) > 1;
  
  SingleFolder <- SingleFile <- MultiFolder <- MultiFile <- FALSE;
  if(!multipleMS){
    # one character provided!
    if(dir.exists(Pathname)){
      SingleFolder <- TRUE;
    } else if(file.exists(Pathname)) {
      SingleFile <- TRUE;
    } else {
      if(.on.public.web()){
        SingleFolder <- TRUE;
        Pathname <- "/home/glassfish/projects/MetaboDemoRawData/upload"
      } else {
        stop("Invalid path provided, please check!")
      }
    }
  } else {
    # one vector provided!
    if(all(sapply(Pathname, dir.exists))){
      MultiFolder <- TRUE;
    } else if(all(sapply(Pathname, file.exists))){
      MultiFile <- TRUE;
    } else {
      stop("Mixed or Invalid filepath type included, please check!")
    }
    
  }

  pathnames <- unname(unlist(sapply(Pathname, tools::file_path_as_absolute)));
  
  if(SingleFile | MultiFile){
    
    files_tmp <- pathnames;
    
  } else {
    
    files_tmp <-
      dir(
        pathnames,
        pattern = ".mzML|.mzml|.cdf|.mzXML|.mzxml|.mzData|.CDF",
        recursive = TRUE,
        full.names = TRUE
      );
    
  }
  
  files_exts <- tools::file_ext(files_tmp);
  if(length(unique(files_exts)) > 1){
    stop("Multiple data formats are not allowed!")
  };
  
  formatRes <- files_exts %in% c("mzML","mzml","cdf","mzXML","mzxml","mzData","CDF");
  
  if(!all(formatRes)){
    warning("Wrong file format detected, Only mzML, mzXML, mzData or CDF will be included.")
  }
  
  files <- files_tmp[formatRes];
  
  # Centroid check & filter
  # if(!isCdfFile(files) && centroid){
  #   
  #   Centroididx <- unname(sapply(files, CentroidCheck));
  #   
  #   if(!all(Centroididx)){
  #     warning(paste0("Uncentroieded file: ", basename(files[!Centroididx]), "will be filtered!\n"));
  #   }
  #   
  #   files <- files[Centroididx];
  # }

  return(files)
}
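
## A minimal usage sketch (assuming the mtbls2 example data is installed):
# files <- Path2Files(system.file("mzData", package = "mtbls2"))
# basename(files)   # all raw MS files found recursively under that folder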

#' @references Gatto L, Gibb S, Rainer J (2020). “MSnbase, efficient and elegant R-based processing and visualisation of raw mass spectrometry data.” bioRxiv.
.testReadMSDataInput <- function(e) {
  if (is.numeric(e$msLevel.) && !all(e$msLevel. > 0))
    stop("msLevel. must be an integer > 0.")
  if (length(e$files) < 1)
    stop("At least one MS file is required.")
  if (length(unique(e$files)) != length(e$files))
    stop("Non unique files provided as input.")
  extensions <- unique(toupper(sub("^.+\\.", "", e$files)))
  if (length(extensions) > 1)
    warning(c("Reading different file formats in. ",
                  "This is untested and you are welcome to try it out. ",
                  "Please report back!", "\n"))
  invisible(TRUE)
}
testCacheArg <- function(cache, maxCache = 2) {
  ## Function used to test the value of a 'cache'
  ## parameter in a read*Data function
  ## Parameters:
  ##  cache: value of the cache argument to test
  ##  maxCache: max value allowed. Generally
  ##            3, but could be less, depending
  ##            on the input data. maxCache is 1
  ##            for readMgfData for instance.
  ## Value: valid (possibly updated) cache value
  if (!is.numeric(cache))
    stop("'cache' must be numeric.")
  if (cache < 0 | cache > maxCache) {
    warning("cache must be [0:", maxCache, "]!")
    if (cache < 0) {
      warning("Setting cache to 0.")
      cache <- 0
    } else {
      warning("Setting cache to ", maxCache, ".")
      cache <- maxCache
    }
  }
  return(cache)
}
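
## Worked examples of the clamping behaviour (values outside [0, maxCache]
## trigger a warning and are reset to the nearest bound):
# testCacheArg(1)                  # 1
# testCacheArg(5, maxCache = 2)    # warns, returns 2
# testCacheArg(-1, maxCache = 2)   # warns, returns 0
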
setCacheEnv <- function(toCache, level = 0, lock = TRUE) {
  ## Set the .cache slot of a pSet object.
  ## Parameters
  ##  toCache a list with
  ##     "assaydata": environment - pSet assaydata slot
  ##     "hd": header dataframe
  ##  level: numeric - cache level
  ##  lock: logical - lock env and bindings (default is TRUE)
  ## Return:
  ##  A new cache environment
  level <- testCacheArg(level)
  cacheEnv <- new.env(parent = emptyenv())
  assaydata <- toCache[["assaydata"]]
  hd <- toCache[["hd"]]
  assign("level", level, cacheEnv)
  if (level >= 1) { ## levels 2 and 3 not yet implemented
    ## precursor MZ
    precMz <- unname(eapply(assaydata, precursorMz))
    assign("rangePrecursorMz", range(precMz), cacheEnv)
    assign("nPrecursorMz", length(precMz), cacheEnv)
    assign("uPrecursorMz", length(unique(precMz)), cacheEnv)
    ## MS2 MS range
    assign("rangeMz", range(unname(eapply(assaydata, mz))),
           cacheEnv)
    ## MS2 retention time
    Rtime <- unname(eapply(assaydata, rtime))
    assign("rangeRtime", range(Rtime), cacheEnv)
    assign("nRtime", length(Rtime), cacheEnv)
    ## MS levels
    assign("msLevels", unique(unlist(eapply(assaydata, msLevel))),
           cacheEnv)
    ## precursor scans
    assign("nPrecursorScans",
           length(unique(eapply(assaydata, precScanNum))), cacheEnv)
    ## assay data size
    assign("size",
           sum(unlist(unname(eapply(assaydata, object.size)))),
           cacheEnv)
    ## full header
    assign("hd", hd, cacheEnv)
  }
  if (lock)
    lockEnvironment(cacheEnv, bindings = TRUE)
  return(cacheEnv)
}
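
## A minimal sketch of how the cache environment is built for the on-disk mode
## (at level 0 only the cache level itself is stored in the environment):
# env <- setCacheEnv(list(assaydata = new.env(parent = emptyenv()), hd = NULL),
#                    level = 0, lock = TRUE)
# get("level", envir = env)   # 0
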
isCdfFile <- function(x) {
  fileEnds <- c("cdf", "nc")
  ## check for endings and an ending followed by a . (e.g. cdf.gz)
  patts <- paste0("\\.", fileEnds, "($|\\.)")
  res <- sapply(patts, function(z) {
    grep(z, x, ignore.case = TRUE)
  })
  return(any(unlist(res)))
}
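
## Examples of the extension check (case-insensitive; also matches compressed
## endings such as .cdf.gz):
# isCdfFile("sample.CDF")      # TRUE
# isCdfFile("sample.cdf.gz")   # TRUE
# isCdfFile("sample.mzML")     # FALSE
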
formatFileSpectrumNames <- function(fileIds, spectrumIds,
                                    nFiles=length(fileIds),
                                    nSpectra=length(spectrumIds)) {
  digits <- ceiling(log10(c(nFiles, nSpectra) + 1L))
  
  if (length(fileIds) != 1L && length(spectrumIds) != length(fileIds)) {
    stop("Length of 'fileIds' has to be one or equal to ",
         "the length of 'spectrumIds'.")
  }
  
  sprintf(paste0("F%0", digits[1L], "d.S%0", digits[2L], "d"),
          fileIds, spectrumIds)
}
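
## Example of the generated spectrum names (zero-padded to the total counts):
# formatFileSpectrumNames(fileIds = 1, spectrumIds = 1:3,
#                         nFiles = 1, nSpectra = 100)
## [1] "F1.S001" "F1.S002" "F1.S003"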

SanitySpectra <- function(rawData, cutoff = 0.15){
  if(length(rawData) != 0){
    
    fileList <- unique(rawData@featureData@data[["fileIdx"]]);
    exIdx = NULL
    
    res <- sapply(fileList, function(x){
      
      otherMean <- mean(rawData@featureData@data[["basePeakIntensity"]][rawData@featureData@data[["fileIdx"]] != x]);
      thisFile <- mean(rawData@featureData@data[["basePeakIntensity"]][rawData@featureData@data[["fileIdx"]] == x]);
      return(thisFile/otherMean)
      
    })
    
    removingFile <- rawData@phenoData@data[["sample_name"]][res < cutoff];
    if(length(removingFile) > 0 && any(!is.na(removingFile))){
      MessageOutput(paste0("Base Peak Intensity of ", removingFile, " is lower than ", 
                           cutoff*100, "% of average, will be excluded!\n"));
    }
    
    exIdx <- which(res < cutoff);
    
    SpectraClean(rawData, exIdx)
    
  } else {
    return(rawData)
  }
  
}
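
## SanitySpectra() is applied right after data import in ImportRawMSData().
## A sketch of a direct call (here `raw_data` is assumed to be an OnDiskMSnExp
## returned by read.MSdata(..., mode = "onDisk")): files whose mean base peak
## intensity falls below 5% of the average of the remaining files are dropped.
# raw_data <- SanitySpectra(raw_data, cutoff = 0.05)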

SpectraClean <- function(rawData, exIdx = NULL){
  
  if(!is.null(exIdx) & length(exIdx) > 0){
    
    rawData@phenoData@data <- rawData@phenoData@data[-exIdx,];
    
    fileExIdx <- (rawData@featureData@data[["fileIdx"]] %in% exIdx);
    rawData@featureData@data <- rawData@featureData@data[!fileExIdx,];
    
    filesArray <- unique(rawData@featureData@data[["fileIdx"]]);
    
    for (i in seq(filesArray)){
      rawData@featureData@data[["fileIdx"]][rawData@featureData@data[["fileIdx"]] == filesArray[i]] <- i
    }
    
    rawData@experimentData@instrumentModel <- rawData@experimentData@instrumentModel[-exIdx];
    rawData@experimentData@instrumentManufacturer <- rawData@experimentData@instrumentManufacturer[-exIdx];
    rawData@experimentData@ionSource <- rawData@experimentData@ionSource[-exIdx];
    rawData@experimentData@analyser <- rawData@experimentData@analyser[-exIdx];
    rawData@experimentData@detectorType <- rawData@experimentData@detectorType[-exIdx];
    rawData@processingData@files <- rawData@processingData@files[-exIdx];
    
  }
  
  return(rawData)
  
}
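
## A sketch of how SpectraClean() is used inside SanitySpectra(): dropping the
## second file from an imported object (here `raw_data` is assumed to be an
## OnDiskMSnExp containing several files):
# raw_data <- SpectraClean(raw_data, exIdx = 2)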

DetectMS2Design <- function(msfile = "NA"){
  
  if(msfile == "NA"){
    stop("MS data file is missing!")
  } else if(!file.exists(msfile)){
    stop("MS data file does not exist!")
  }
  msdata <- mzR::openMSfile(msfile, backend = NULL)
  fullhd <- mzR::header(msdata)
  
  targetmzs <- fullhd$isolationWindowTargetMZ
  mzsloffst <- fullhd$isolationWindowLowerOffset
  mzsupffst <- fullhd$isolationWindowUpperOffset
  
  if(all(is.na(mzsloffst)) | all(is.na(mzsupffst)) | all(is.na(targetmzs))){
    ms1idx <- which(fullhd$msLevel == 1)
    
    lgnms1 <- ms1idx[1:2]
    lgnvec<- length(c(lgnms1[1]:lgnms1[2]))

    design_df2 <- design_df <- 
      data.frame(mz_low = double(length = lgnvec-2),
                 mz_up = double(length = lgnvec-2))

    
    gc()
    mzR::close(msdata)
    rm(msdata);
    return(design_df)
  }
  
  idx_set <- !is.na(targetmzs)
  targetmzs <- targetmzs[idx_set]
  mzsloffst <- mzsloffst[idx_set]
  mzsupffst <- mzsupffst[idx_set]
  
  mz1st <- targetmzs[!is.na(targetmzs)][1]
  
  idx_lst <- which(mz1st == targetmzs)[2]-1
  design_df0 <- design_df2 <- design_df <- 
    data.frame(mz_low = double(length = idx_lst),
               mz_up = double(length = idx_lst))
  
  for(i in 1:idx_lst){
    design_df[i, ]<- c((targetmzs[i] - mzsloffst[i]), 
                       (targetmzs[i] + mzsupffst[i]))
  }
  
  for(j in (idx_lst+1):(idx_lst*2)){
    design_df2[j-idx_lst, ]<- c((targetmzs[j] - mzsloffst[j]), 
                        (targetmzs[j] + mzsupffst[j]))
  }
  
  gc()
  mzR::close(msdata)
  rm(msdata);
  
  if(all(design_df == design_df2)){
    return(design_df)
  } else {
    return(design_df0)
  }
}
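
## A minimal usage sketch (the file path below is hypothetical). For SWATH/DIA
## data the returned data.frame contains one row per isolation window, giving
## the lower and upper m/z boundaries of each window:
# design <- DetectMS2Design("path/to/swath_file.mzML")
# head(design)   # columns: mz_low, mz_up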