R/doProcessing.R

Defines functions setPPMbounds setLogFile detectCores

Documented in detectCores setLogFile setPPMbounds

# ID doProcessing.R
# Copyright (C) 2017-2019 INRA
# Authors: D. Jacob
#

#' detectCores
#'
#' \code{detectCores} is simply a shortcut for parallel::detectCores().
#'
#' @param ...  See \code{parallel::detectCores}
detectCores <- function(...) {
   parallel::detectCores(...)
}

#' setLogFile
#'
#' \code{setLogFile} allows to redirect all log messages to a file
#'
#' @param con  a connection object which inherits from class "connection" 
#' @examples
#'  \donttest{
#'   # Redirect all log messages to a temporary file
#'    outtmp <- tempfile()
#'    con <- file(outtmp, "wt", encoding = "UTF-8")
#'    setLogFile(con)
#'    data_dir <- system.file("extra", package = "Rnmr1D")
#'    RAWDIR <- file.path(data_dir, "CD_BBI_16P02")
#'    CMDFILE <- file.path(data_dir, "NP_macro_cmd.txt")
#'    SAMPLEFILE <- file.path(data_dir, "Samples.txt")
#'    out <- Rnmr1D::doProcessing(RAWDIR, cmdfile=CMDFILE, samplefile=SAMPLEFILE, ncpu=2)
#'    close(con)
#'    readLines(outtmp)
#'  }
setLogFile <- function(con=stdout())
{
   if ("connection" %in% class(con) ) globvars$LOGFILE <- con
}

#' setPPMbounds
#'
#' Set the PPM bounds for proton (1H) and carbon (13C) to consider in the processing step and then to store in the specMat object
#'
#' @param proton Minimal and Maximal ppm value for 1H NMR
#' @param carbon Minimal and Maximal ppm value for 13C NMR
setPPMbounds <- function(proton=c(-0.5,11), carbon=c(0,200))
{
   globvars$PPM_MIN <-  proton[1]
   globvars$PPM_MAX <-  proton[2]
   globvars$PPM_MIN_13C <- carbon[1]
   globvars$PPM_MAX_13C <- carbon[2]
}

#' doProcessing 
#'
#' \code{doProcessing} is the main function of this package. Indeed, this function performs 
#' the complete processing of a set of 1D NMR spectra from the FID (raw data) and based on a 
#' processing sequence (macro-command file). An additional file specifies all the spectra to 
#' be considered by associating their sample code as well as the levels of experimental 
#' factors to which they belong. In this way it is possible to select only a subset of spectra 
#' instead of the whole set. 
#'
#' @param path  The full path of either the raw spectra directory on the disk
#' @param cmdfile The full path name of the Macro-commands file for processing (text format)
#' @param samplefile The full path name of the Sample file (tabular format)
#' @param bucketfile The full path name of the file of bucket's zones (tabular format)
#' @param phcfile The full path name of the phasing file for samples if required (tabular format)
#' @param ncpu The number of cores [default: 1]
#' @return
#' \code{doProcessing} returns a list containing the following components:
#' \itemize{
#'   \item \code{samples} : the samples matrix with the correspondence of the raw spectra, 
#' as well as the levels of the experimental factors if specified in the input.
#'   \item \code{factors} : the factors matrix with the corresponding factor names. 
#' At minimum, the list contains the Samplecode label corresponding to the samples without their 
#' group level.
#'   \item \code{rawids} : list of the full directories of the raw spectra (i.e. where the FID files 
#' are accessible)
#'   \item \code{infos} : list of the acquisition and processing parameters for each (raw) spectra.
#'   \item \code{specMat} : objects list  regarding the spectra data.
#'       \itemize{
#'             \item \code{int} : the matrix of the spectra data (\code{nspec} rows X \code{size} 
#' columns)
#'             \item \code{nspec} : the number of spectra
#'             \item \code{size} : the size (i.e number of points) of each spectra
#'             \item \code{ppm_min}, \code{ppm_max} : the minimum and the maximum ppm values of 
#' spectra
#'             \item \code{ppm} : the vector of the ppm values (\code{size} values)
#'             \item \code{dppm} : the ppm increment between each point
#'             \item \code{buckets_zones} : the matrix of the buckets zones including two columns 
#' (min and max) 
#'             \item \code{namesASintMax} : boolean - If TRUE, generate all output matrix with 
#' bucket names based on ppm values of the maximum of the average intensity of all spectra within
#' the ppm range of each bucket. If FALSE (default), then bucket names will be based on the ppm 
#' range center of each bucket.
#'         }
#' }
#' @examples
#'  \donttest{
#'     data_dir <- system.file("extra", package = "Rnmr1D")
#'     cmdfile <- file.path(data_dir, "NP_macro_cmd.txt")
#'     samplefile <- file.path(data_dir, "Samples.txt")
#'     out <- Rnmr1D::doProcessing(data_dir, cmdfile=cmdfile, 
#'                                 samplefile=samplefile, ncpu=2)
#' }
#' @seealso the NMRProcFlow online documentation \url{https://nmrprocflow.org/} and especially 
#' the Macro-command Reference Guide (\url{https://nmrprocflow.org/themes/pdf/Macrocommand.pdf})
#'
doProcessing <- function (path, cmdfile, samplefile=NULL, bucketfile=NULL, phcfile=NULL, ncpu=1 )
{
   if( ! file.exists(path))
       stop(paste0("ERROR: ",path," does NOT exist\n"), call.=FALSE)
   if( ! file.exists(cmdfile))
       stop(paste0("ERROR: ",cmdfile," does NOT exist\n"), call.=FALSE)
   if( ! is.null(samplefile) && ! file.exists(samplefile))
       stop(paste0("ERROR: ",samplefile," does NOT exist\n"), call.=FALSE)
   if( ! is.null(bucketfile) && ! file.exists(bucketfile))
       stop(paste0("ERROR: ",bucketfile," does NOT exist\n"), call.=FALSE)
   if ( checkMacroCmdFile(cmdfile) == 0 )
       stop(paste0("ERROR: ",cmdfile," seems to include errors\n"), call.=FALSE)

   trim <- function (x) gsub("^\\s+|\\s+$", "", x)
   LOGFILE <- globvars$LOGFILE

   Write.LOG(LOGFILE, "Rnmr1D:  --- READING and CONVERTING ---\n")

   # Initialize the list of processing parameters
   procParams <- Spec1rProcpar
   procParams$VENDOR <- "bruker"
   procParams$INPUT_SIGNAL <- "1r"
   procParams$READ_RAW_ONLY <- TRUE
   procParams$PHCFILE <- FALSE

   # Rnmr1D macrocommand file: Get the preprocessing parameter line if exists
   CMDTEXT <- gsub("\t", "", readLines(cmdfile))
   if ( length(grep("#%%", CMDTEXT[1]))==1 ) {
        procpar <- unlist(strsplit(gsub("#%% ", "", CMDTEXT[1]), "; "))
        Write.LOG(LOGFILE, paste0( "Rnmr1D:  ", paste(procpar,collapse=", "), "\n"))
        parnames <- NULL; parvals <- NULL
        for (param in procpar ) { parnames <- c( parnames, unlist(strsplit(param,"="))[1] ); parvals <- c( parvals, unlist(strsplit(param,"="))[2] ); }
        names(parvals) <- parnames;  procpar <- data.frame(t(parvals), stringsAsFactors=FALSE)
        if (! is.null(procpar$Vendor)) procParams$VENDOR <- tolower(trim(procpar$Vendor))
        if (! is.null(procpar$Type)) procParams$INPUT_SIGNAL <- trim(procpar$Type)
        if (! is.null(procpar$LB)) procParams$LB <- as.numeric(procpar$LB)
        if (! is.null(procpar$GB)) procParams$GB <- as.numeric(procpar$GB)
        if (! is.null(procpar$ZNEG)) procParams$RABOT <- ifelse( procpar$ZNEG=="TRUE", TRUE, FALSE)
        if (! is.null(procpar$TSP)) procParams$TSP <- ifelse( procpar$TSP=="TRUE", TRUE, FALSE)
        if (! is.null(procpar$O1RATIO)) procParams$O1RATIO <- as.numeric(procpar$O1RATIO)
        if (! is.null(procpar$ZF)) procParams$ZEROFILLING <- ifelse (as.numeric(procpar$ZF)>0, TRUE, FALSE)
        if (! is.null(procpar$ZF)) procParams$ZFFAC <- as.numeric(procpar$ZF)
        if (! is.null(procpar$FILEPHC) && procpar$FILEPHC=="TRUE" && !is.null(phcfile)) {
            procpar$USRPHC <- "TRUE"; procpar$PHCFILE <- TRUE
        }
        if (! is.null(procpar$USRPHC) && procpar$USRPHC=="TRUE") {
           procParams$OPTPHC0 <- procParams$OPTPHC1 <- FALSE
           if (procpar$PHCFILE) {
               if (! is.null(procpar$PHC0)) procParams$phc0 <- 0
               if (! is.null(procpar$PHC1)) procParams$phc1 <- 0
           } else {
               if (! is.null(procpar$PHC0)) procParams$phc0 <- as.numeric(procpar$PHC0)
               if (! is.null(procpar$PHC1)) procParams$phc1 <- as.numeric(procpar$PHC1)
           }
        } else {
           if (! is.null(procpar$PHC1)) procParams$OPTPHC0 <- ifelse( procpar$PHC1=="TRUE", FALSE, TRUE)
           if (! is.null(procpar$PHC1)) procParams$OPTPHC1 <- ifelse( procpar$PHC1=="TRUE", TRUE, FALSE)
           if (! is.null(procpar$CRIT1)) procParams$CRITSTEP1 <- as.numeric(procpar$CRIT1)
           # additional parameters with default values
           #procParams$BLPHC <- 50
           #procParams$KSIG <- 2
           #procParams$GAMMA <- 0.005
           #procParams$RATIOPOSNEGMIN <- 0.45
           #procParams$CLEANUP_OUTPUT <- TRUE
           #procParams$LINEBROADENING <- TRUE
           #procParams$O1RATIO <- 1
           #procParams$REMLFREQ <- 0
           #procParams$CPMG <- FALSE
           #procParams$KZERO <- 0.3
           #procParams$OPTSTEP <- TRUE
           #procParams$OPTCRIT0 <- 0
           #procParams$OPTCRIT1 <- 2
           #procParams$CRITSTEP2 <- 2
           #procParams$JGD_INNER <- TRUE
           #procParams$RMS <- 0
           #procParams$OC <- FALSE
        }
        if (procParams$INPUT_SIGNAL=='fid') procParams$READ_RAW_ONLY <- FALSE
        procParams$REVPPM <- ifelse(procParams$VENDOR == 'varian' || procParams$VENDOR == 'jeol', TRUE, FALSE)
   }

   # Generate the 'samples.csv' & 'factors' files from the list of raw spectra
   Write.LOG(LOGFILE, "Rnmr1D:  Generate the 'samples' & 'factors' files from the list of raw spectra\n")

   metadata <- NULL
   samples <- NULL
   if (!is.null(samplefile) && file.exists(samplefile))
      samples <- utils::read.table(samplefile, sep="\t", header=T,stringsAsFactors=FALSE)

   # Read the phasing file for samples if specified
   if (!is.null(phcfile) && procpar$PHCFILE)
       PHC <- utils::read.table(phcfile, sep="\t", header=T, stringsAsFactors=F)

   metadata <- generateMetadata(path, procParams, samples)

   # If ERROR occurs ...
   if (is.null(metadata)) {
       msg <- "Something failed when attempting to generate the metadata files"
       stop(paste0(msg,"\n"), call.=FALSE)
   }
   gc()

   LIST <- metadata$rawids
   Write.LOG(LOGFILE, paste0("Rnmr1D:  -- Nb Spectra = ",dim(LIST)[1]," -- Nb Cores = ",ncpu,"\n"))

   specObj <- NULL
   tryCatch({

       cl <- parallel::makeCluster(ncpu)
       doParallel::registerDoParallel(cl)
       Sys.sleep(1)

       x <- 0
       specList <- foreach::foreach(x=1:(dim(LIST)[1]), .combine=cbind) %dopar% {
            ACQDIR <- LIST[x,1]
            NAMEDIR <- ifelse( procParams$VENDOR=='bruker', basename(dirname(ACQDIR)), basename(ACQDIR) )
            PDATA_DIR <- ifelse( procParams$VENDOR=='rs2d', 'Proc', 'pdata' )
            if (procParams$INPUT_SIGNAL=='fid' && procParams$PHCFILE) {
                n <- which(PHC[,1]==NAMEDIR)
                procParams$phc0 <- as.numeric(PHC[n,2])*pi/180
                procParams$phc1 <- as.numeric(PHC[n,3])*pi/180
                procParams$OPTPHC0 <- procParams$OPTPHC1 <- FALSE
            }
            # Init the log filename
            procParams$LOGFILE <- globvars$LOGFILE
            procParams$PDATA_DIR <- file.path(PDATA_DIR,LIST[x,3])
            spec <- Spec1rDoProc(Input=ACQDIR,param=procParams)
            if (procParams$INPUT_SIGNAL=='1r') Sys.sleep(0.3)
            Write.LOG(stderr(),".")
            if (dim(LIST)[1]>1) {
                list( x, spec )
            } else {
                spec
            }
       }
       Write.LOG(LOGFILE,"\n")
       gc()

       parallel::stopCluster(cl)

       if (dim(LIST)[1]>1) {
          # Ensure that the specList array is in the same order than both  samples and IDS arrays
          L <- simplify2array(sapply( order(simplify2array(specList[1,])), function(x) { specList[2,x] } ) )
          specList <- L
       }

       # Get all spectra that are correcly processed
       N <- dim(LIST)[1]
       idsOK <- NULL
       PPM_MIN <- -1000
       PPM_MAX <- 1000
       for(i in 1:N) {
          if (N>1) { spec <- specList[,i]; } else { spec <- specList; }
          if (! is.null(spec$acq) ) {
              idsOK <- c( idsOK, i )
              PPM_MIN <- max(PPM_MIN, spec$pmin)
              PPM_MAX <- min(PPM_MAX, spec$pmax)
          }
       }
       Write.LOG(LOGFILE, paste0('Rnmr1D:  PPM range = [',round(PPM_MIN,4)," , ",round(PPM_MAX,4),"]\n"))
       Write.LOG(LOGFILE,"\n")

       if (length(idsOK)<N) {
          specList <- specList[,idsOK]
          LIST <- LIST[idsOK, ]
          metadata$samples <- metadata$samples[idsOK, ]
          metadata$rawids <- metadata$rawids[idsOK, ]
          Write.LOG(LOGFILE, paste0("Rnmr1D:  ", N-length(idsOK)," ERRORS FOUND! \n"))
       }

       Write.LOG(LOGFILE, "Rnmr1D:  Generate the final matrix of spectra...\n")

       M <- NULL
       N <- dim(LIST)[1]
       vpmin<-0; vpmax<-0

       for(i in 1:N) {
           if (N>1) { spec <- specList[,i]; } else { spec <- specList; }
           #PPM_MIN <- globvars$PPM_MIN; PPM_MAX <- globvars$PPM_MAX;
           #if (spec$acq$NUC == "13C") { PPM_MIN <- globvars$PPM_MIN_13C; PPM_MAX <- globvars$PPM_MAX_13C; }
           P <- spec$ppm>PPM_MIN & spec$ppm<=PPM_MAX
           V <- spec$int[P]
           vppm <- spec$ppm[P]
           if (PPM_MIN<spec$pmin) {
               nbzeros <- round((spec$pmin - PPM_MIN)/spec$dppm)
               vpmin <- vpmin + spec$pmin - nbzeros*spec$dppm
               V <- c( rep(0,nbzeros), V )
           } else {
               vpmin <- vpmin + vppm[1]
           }
           if (PPM_MAX>spec$pmax) {
               nbzeros <- round((PPM_MAX - spec$pmax)/spec$dppm)
               vpmax <- vpmax + spec$pmax + nbzeros*spec$dppm
               V <- c( V, rep(0,nbzeros) )
           } else {
               vpmax <- vpmax + vppm[length(vppm)]
           }
           M <- rbind(M, rev(V))
       }

       cur_dir <- getwd()

       specMat <- NULL
       specMat$int <- M
       specMat$ppm_max <- (vpmax/N);
       specMat$ppm_min <- (vpmin/N);
       specMat$nspec <- dim(M)[1]
       specMat$size <- dim(M)[2]
       specMat$dppm <- (specMat$ppm_max - specMat$ppm_min)/(specMat$size - 1)
       specMat$ppm <- rev(seq(from=specMat$ppm_min, to=specMat$ppm_max, by=specMat$dppm))
       specMat$buckets_zones <- NULL
       specMat$namesASintMax <- FALSE  # FALSE -> center, TRUE -> intMax
       specMat$fWriteSpec <- FALSE

       specObj <- metadata
       specObj$procParams <- procParams
       specObj$specMat <- specMat

       samples <- metadata$samples

       # Raw IDs : expno & procno 
       IDS <- cbind(basename(dirname(as.vector(LIST[,1]))), LIST[, c(2:3)])
       if (N>1) {
             PARS <- t(sapply( c(1:N), function(x) {
                       c( specList[,x]$acq$PULSE, specList[,x]$acq$NUC,   specList[,x]$acq$SOLVENT,    specList[,x]$acq$GRPDLY, 
                          specList[,x]$proc$phc0, specList[,x]$proc$phc1, specList[,x]$acq$SFO1,       specList[,x]$proc$SI, 
                          specList[,x]$acq$SW,    specList[,x]$acq$SWH,   specList[,x]$acq$RELAXDELAY, specList[,x]$acq$O1 )
             }))
             specObj$nuc <- specList[,1]$acq$NUC
       } else {
             PARS <- t( c( spec$acq$PULSE, spec$acq$NUC, spec$acq$SOLVENT, spec$acq$GRPDLY, spec$proc$phc0,      spec$proc$phc1,
                           spec$acq$SFO1,  spec$proc$SI, specList$acq$SW,  spec$acq$SWH,    spec$acq$RELAXDELAY, spec$acq$O1) )
             specObj$nuc <- spec$acq$NUC
       }
       LABELS <- c("PULSE", "NUC", "SOLVENT", "GRPDLY", "PHC0","PHC1","SF","SI","SW", "SWH","RELAXDELAY","O1" )
       
       if (regexpr('BRUKER', toupper(specList[,1]$acq$INSTRUMENT))>0) {
          if (N>1) { PARS <- cbind( IDS[,c(2:3)], PARS ) } else { PARS <- c( IDS[1,c(2:3)], PARS ) }
          LABELS <- c("EXPNO", "PROCNO", LABELS)
       }
       if (regexpr('RS2D', toupper(specList[,1]$acq$INSTRUMENT))>0) {
          if (N>1) { PARS <- cbind( IDS[,3], PARS ) } else { PARS <- c( IDS[1,3], PARS ) }
          LABELS <- c("PROCNO", LABELS)
       }
       if (N>1) {
          PARS <- cbind( samples[,1], samples[,2], PARS )
       } else {
          PARS <- c( samples[1, 1], samples[1, 2], PARS )
       }
       colnames(PARS) <- c("Spectrum", "Samplecode", LABELS )
       specObj$infos <- PARS
       specObj$origin <- paste(procParams$VENDOR, procParams$INPUT_SIGNAL)

     # Rnmr1D processing macrocommand file
       Write.LOG(LOGFILE,"Rnmr1D: ------------------------------------\n")
       Write.LOG(LOGFILE,"Rnmr1D: Process the Macro-commands file\n")
       Write.LOG(LOGFILE,"Rnmr1D: ------------------------------------\n")
       Write.LOG(LOGFILE,"Rnmr1D: \n")

     # Process the Macro-commands file
       specMat <- doProcCmd(specObj, CMDTEXT, ncpu=ncpu, debug=TRUE)
       if (specMat$fWriteSpec) specObj$specMat <- specMat
       gc()

     # Performs the bucketing based on the bucket list file
       if( ! is.null(bucketfile) && file.exists(bucketfile)) {
           Write.LOG(LOGFILE, "Rnmr1D: ------------------------------------\n")
           Write.LOG(LOGFILE, "Rnmr1D: Process the file of buckets\n")
           Write.LOG(LOGFILE, "Rnmr1D: ------------------------------------\n")
           Write.LOG(LOGFILE, "Rnmr1D: \n")

           # The bucket zones file
           buckets_infile <- utils::read.table(bucketfile, header=T, sep="\t",stringsAsFactors=FALSE)
           if ( sum(c('min','max') %in% colnames(buckets_infile)) == 2 ) {
                buckets <- cbind( buckets_infile$max, buckets_infile$min )
                specObj$specMat$buckets_zones <- buckets
                Write.LOG(LOGFILE, paste0("Rnmr1D:     NB Buckets = ",dim(buckets)[1],"\n"))
                Write.LOG(LOGFILE, "Rnmr1D: \n")
           } else {
                Write.LOG(LOGFILE,"ERROR: the file of bucket's areas does not contain the 2 mandatory columns having 'min' and 'max' in its header line\n")
           }
       }

       specObj$specMat$fWriteSpec <- NULL
       specObj$specMat$LOGMSG <- NULL

   }, error=function(e) {
       cat(paste0("ERROR: ",e))
   })

   return(specObj)

}
INRA/Rnmr1D documentation built on April 11, 2024, 1:29 a.m.