R/getCVreplicates.R

#' getCVreplicates
#' 
#' Evaluate feature consistency based on coefficient of variation, where
#' cv=sd/mean.  Only calculate CV for samples for which a signal is detected in
#' at least 60% of technical replicates.  If signal is detected in 60% of
#' technical replicates, then the missing values are replaced by mean intensity
#' which is calculated using non-missing replicates.
#' 
#' 
#' @param curdata Feature alignment output matrix from apLCMS or XCMS with
#' sample intensities
#' @param alignment.tool Name of the feature alignment tool eg: "apLCMS" or
#' "XCMS"
#' @param numreplicates Number of replicates per sample
#' @param min.samp.percent If signal is detected in x proportion of technical
#' replicates, then the missing values are replaced by mean intensity which is
#' calculated using non-missing replicates. eg: 0.7
#' @param impute.bool Should the missingvalues be replaced by mean of the other
#' replicates? eg: TRUE or FALSE
#' @param missingvalue How are the missing values represented? eg: 0 or NA
#' @return Matrix of feature consistency based on CV with columns: mz: m/z of
#' the feature minCV: minimum CV between technical replicates across all
#' samples first_quartileCV: 25th percentile CV medianCV: 50th percentile CV
#' meanCV: average of cofficient of variations between technical replicates per
#' sample across all samples third_quartileCV: 75th percentile CV maxCV:
#' maximum CV between technical replicates across all samples
#' @author Karan Uppal <kuppal2@@emory.edu>
#' @keywords ~CV
getCVreplicates <- function(curdata, alignment.tool, 
    numreplicates, min.samp.percent = 0.6, impute.bool = TRUE, 
    missingvalue = 0) {
    mean_replicate_difference <- {
    }
    sd_range_duplicate_pairs <- {
    }
    
    if (alignment.tool == "apLCMS") {
        col_end = 4
    } else {
        if (alignment.tool == "XCMS") {
            col_end = 8
        } else {
            col_end = 2
            print("**Using the first two columns as mz and retention time for PID calculation**")
            # stop(paste('Invalid value for alignment.tool.
            # Please use either \'apLCMS\' or \'XCMS\'',
            # sep=''))
        }
    }
    
    curdata_mz_rt_info = curdata[, c(1:col_end)]
    curdata = curdata[, -c(1:col_end)]
    
    # curdata<-curdata[1:10,]
    numfeats = dim(curdata)[1]
    numsamp = dim(curdata)[2]
    rnames <- colnames(curdata)
    rnames <- gsub(".cdf", "", rnames, ignore.case = TRUE)
    quantcolnames = c("min", "first_quartile", "median", 
        "mean", "third_quartile", "max", "sampleCount")
    if (impute.bool == TRUE) {
        min.samp.percent = min.samp.percent
    } else {
        min.samp.percent = 1
    }
    
    
    newrow = {
    }
    finalmat = {
    }
    
    
    cl <- makeSOCKcluster(5)
    
    
    
    clusterExport(cl, "getCVreplicates.child")
    # clusterExport(cl, 'numsamp') clusterExport(cl,
    # 'numreplicates') clusterExport(cl,
    # 'min.samp.percent') clusterExport(cl,
    # 'impute.bool') clusterExport(cl,
    # 'alignment.tool')
    
    cv.res <- parApply(cl, curdata, 1, getCVreplicates.child, 
        numsamp = numsamp, numreplicates = numreplicates, 
        min.samp.percent = min.samp.percent, impute.bool = impute.bool, 
        missingvalue = missingvalue)
    
    dim(cv.res) = dim(matrix(nrow = 7, ncol = numfeats))
    # cv.res<-t(cv.res)
    
    
    stopCluster(cl)
    final_set <- as.data.frame(cv.res)
    rownames(final_set) = NULL
    final_set <- apply(final_set, 2, as.numeric)
    final_set <- as.data.frame(t(final_set))
    
    numsamp = numsamp/numreplicates
    colnames(final_set) <- c("min", "first_quartile", 
        "median", "mean", "third_quartile", "max", 
        "numgoodsamples")
    
    # deltappm_res<-apply(curdata_mz_rt_info,1,get_deltappm)
    # delta_cv_range<-as.numeric(final_set$max)-as.numeric(final_set$min)+0.1
    
    # Qscore<-100*((final_set$numgoodsamples)/(delta_cv_range*numsamp*(deltappm_res+0.1)))
    
    
    # Qscore<-100*(final_set$numgoodsamples/(final_set$median*numsamp))
    
    mz_min_max <- cbind(curdata_mz_rt_info[, 1], curdata_mz_rt_info[, 
        1])
    if (alignment.tool == "apLCMS") {
        mz_min_max <- cbind(curdata_mz_rt_info[, 3], 
            curdata_mz_rt_info[, 4])
    } else {
        if (alignment.tool == "XCMS") {
            mz_min_max <- cbind(curdata_mz_rt_info[, 
                2], curdata_mz_rt_info[, 3])
        }
    }
    mz_min_max <- as.data.frame(mz_min_max)
    
    deltappm_res <- apply(mz_min_max, 1, get_deltappm)
    delta_cv_range <- 1  #as.numeric(final_set$max)-as.numeric(final_set$min)+0.1
    
    
    # Qscore<-100*(final_set$numgoodsamples)/((((as.numeric(final_set$median)+1)*numsamp*((10*(deltappm_res+0.1))))+1))
    
    if (is.na(missingvalue) == TRUE) {
        num_tot_nonzeros <- apply(curdata, 1, function(x) {
            length(which(is.na(x) == FALSE))
        })
        
        
    } else {
        
        num_tot_nonzeros <- apply(curdata, 1, function(x) {
            length(which(x > missingvalue))
        })
        
    }
    
    # Qscore<-100*((final_set$numgoodsamples)/(delta_cv_range*as.numeric(final_set$median)*num_tot_zeros*(deltappm_res+0.1)))
    
    # Qscore<-100*((final_set$numgoodsamples)/((as.numeric(final_set$median)*num_tot_zeros*(deltappm_res+0.1))+1)
    
    # Qscore<-100*((final_set$numgoodsamples)/(((as.numeric(final_set$median)+1)*(num_tot_zeros+0.1)*(deltappm_res+0.1))+1))
    
    # Qscore<-100*(final_set$numgoodsamples)/((((as.numeric(final_set$median)+1)*numsamp*(((deltappm_res+0.1))))+1))
    
    # Qscore<-(100*final_set$numgoodsamples)/((((as.numeric(final_set$median)+0.01)*numsamp*(((deltappm_res+0.01))))+1))
    
    # Qscore<-(100*final_set$numgoodsamples)/((as.numeric(final_set$median)+0.01)*num_tot_nonzeros*0.5*(deltappm_res+0.01))
    
    # part of xMSanalyzer v2.0.4
    Qscore <- (100 * final_set$numgoodsamples)/(((as.numeric(final_set$median) + 
        0.01 + 1) * num_tot_nonzeros * (1/numreplicates) * 
        (deltappm_res + 0.01 + 1)) + 0.01)
    
    termA <- final_set$median + 0.01 + 1
    termB <- num_tot_nonzeros * (1/numreplicates)
    # Qscore<-(100*final_set$numgoodsamples)/(((max(as.numeric(final_set$median),as.numeric(final_set$max),na.rm=TRUE)+0.01+1)*num_tot_nonzeros*(1/numreplicates))+0.01)
    
    # part of xMSanalyzer_v2.0.6
    Qscore <- (100 * final_set$numgoodsamples)/((termA * 
        termB) + 0.01)
    
    
    
    final_set <- cbind(final_set, Qscore)
    final_set <- as.data.frame(final_set)
    
    # quantcolnames
    return(final_set)
}
BowenNCSU/xMSanalyzer documentation built on May 24, 2019, 7:36 a.m.