#' shiftCor provide the QC-RLS correction for large scale metabolomics.
#' @param samPeno a file with the meta information.
#' @param samFile a file with the expression information.
#' @param Frule The cut-off value for missing value filter function.
#' @param QCspan The smoothing parameter which controls the bias-variance tradeoff. if the QCspan is set at '0', the generalised cross-validation will be performed to avoid overfitting the observed data.
#' @param degree Lets you specify local constant regression (i.e., the Nadaraya-Watson estimator, degree=0), local linear regression (degree=1), or local polynomial fits (degree=2, the default).
#' @param imputeM The parameter for imputation method.(i.e., nearest neighbor averaging, "KNN"; minimum values for imputed variables, "min", median values for imputed variables (Group dependent) "median").
#' examples
#' library(statTarget)
#' datpath <- system.file("extdata",package = "statTarget")
#' samPeno <- paste(datpath,"MTBLS79_sampleList.csv", sep="/")
#' samFile <- paste(datpath,"MTBLS79.csv", sep="/")
#' shiftCor(samPeno,samFile,Frule = 0.8,QCspan = 0.75, degree = 2,imputeM = "KNN")
#' export
#' @import statTarget
shiftCor <- function(samPeno,samFile,Frule = 0.8,QCspan = 0.75, degree = 2,imputeM = "KNN"){
#imputeM = "median","KNN" or "min"
#degree = "0","1","2"
#samPeno <- read.csv(samPeno, header=TRUE, check.names = F,stringsAsFactors = F)
samPeno <- as.data.frame(samPeno)
#samFile <- read.csv(samFile,header=FALSE, check.names = F,stringsAsFactors = F)
samFile <- t(samFile)
#colnames(samFile) <- samFile[1,]
samFile <- as.data.frame(samFile[-1,])
#rownames(samFile) <- samFile$name
samFP <- samFile
#if(sum(is.na(samPeno$class)) <=0 ){
#message(date(),"There were not QC sample in your data!")
#return(para)
#}
#################LOESS data NA########
samFP <- as.matrix(samFP)
samFP[samFP<=0] <- NA
#cat(date(), "\nstatTarget: shiftCor start... \nEvaluation of missing value...")
#message(date(),"The number of NA value in Data Profile before QC-RLSC: ",
#sum(is.na(samFP) | as.matrix(samFP) <= 0))
imsamFP <- samFP
#############Filter miss value###################
FilterMV = function(m,degre) {
dx <- c()
for(i in 1:ncol(m)){
freq <- as.vector(tapply(m[,i], degre, function(x){sum(is.na(x) | as.matrix(x) <= 0)/length(x)}))
if(sum(freq > Frule) > 0) dx <- c(dx , i)
}
if(length(dx) >0) m <- m[,-dx]
return(m)
}
classF <- as.factor(samPeno$class)
classF = addNA(classF)
imsamFPF = FilterMV(imsamFP,classF)
#Frule_warning= paste("The number of vaiables including", Frule*100, "% of missing value :",sep = " ")
#message(date(),Frule_warning," ", dim(imsamFP)[2]-dim(imsamFPF)[2])
imsamFP = imsamFPF
##############impute missing value#################
#cat(date(), "\nImputation start...\n")
if(imputeM == "KNN"){
#require(impute)
mvd <- impute::impute.knn(imsamFP[,1:ncol(imsamFP)])
inputedData <- mvd$data
}else if(imputeM == "min"){
inputedData <- apply(imsamFP[,2:ncol(imsamFP)],2,function(y){
y[is.na(y) | y<=0] <- min(y[y>0],na.rm = TRUE)
y})
#inputedData <- t(inputedData)
}else if(imputeM == "median"){
missvalue <- function(x,group) {
x[is.na(x) == TRUE ] <- 0
group = as.factor(as.numeric(group))
for (i in 1:dim(x)[1]){
for(j in 2:dim(x)[2]){
if(x[i,j] == 0 | sum(is.na(x[i,j])) >= 1){
#x[i,j][is.na(x[i,j]) == TRUE ] <- 0
x[i,j] <- tapply(as.numeric(x[,j]),group,median)[group[i]]
}
}
}
return(x)
}
inputedData = missvalue(imsamFP,classF)
inputedData = inputedData[,-1]
}
#message(date(), "The number of NA value in Data Profile after the initial imputation: ",
#sum(is.na(inputedData) | as.matrix(inputedData) <= 0))
if(sum(is.na(inputedData) | as.matrix(inputedData) <= 0) > 0)
{
inputedData[inputedData<=0] <- NA
mvd2 <- impute::impute.knn(inputedData[,1:ncol(inputedData)])
inputedData <- mvd2$data
#message(date(), "The number of NA value in Data Profile after the second imputation (KNN): ",
#sum(is.na(inputedData) | as.matrix(inputedData) <= 0))
}
#cat(date(), "\nImputation Finished!\n")
dat <- as.matrix(t(inputedData))
numX <- 1:dim(dat)[2]
if(QCspan > 0){
loessFit=function(x,y,QCspan,degree){
cn <- colnames(x)
####Check########
st_QC<-grep("QC",cn[1])
ed_QC<-grep("QC",cn[length(cn)])
if(length(st_QC)==0)
{
stop("Wrong: the first sample must be QC sample; please check ......");
}
if(length(ed_QC)==0)
{
stop("Wrong: the sample at the end of sequence must be QC sample; please check ......");
}
qcid <- grep("QC",cn)
for(i in 1:dim(x)[1]){
loe <- stats::loess(x[i,qcid]~qcid,span=QCspan,degree = degree)
yf <- stats::predict(loe,y)
x[i,] <- as.numeric(x[i,])/yf
}
loessDat = x
}
loessDat <- loessFit(x = dat,y=numX,QCspan = QCspan,degree = degree)
}else if(QCspan <= 0){
#message(date(),"\nWarning: The QCspan was set at '0'. \nThe LOESS based generalised cross-validation was used to avoid overfitting the observed data")
autoFit <- function(xl,y){
cn <- colnames(xl)
####Check########
st_QC<-grep("QC",cn[1])
ed_QC<-grep("QC",cn[length(cn)])
if(length(st_QC)==0)
{
stop("Wrong: the first sample must be QC sample; please check ......");
}
if(length(ed_QC)==0)
{
stop("Wrong: the sample at the end of sequence must be QC sample; please check ......");
}
qcid <- grep("QC",cn)
for(i in 1:dim(xl)[1]){
loe1 <- loess(xl[i,qcid]~qcid)
loe2 <- loe1
sp <- c(seq(0.05,0.75,0.01))
CVspan <-c()
for(j in 1:length(sp)){
mod <- stats::update(loe1, span = sp[j])
CVspan[j] = loessGCV(mod)[["gcv"]]
}
minG <- as.matrix(data.frame(sp,CVspan))
minG[!is.finite(minG)] <- max(minG[,2],na.rm = TRUE)
minspan <- minG[which.min(minG[,2]),1]
minspan
#spanNew <- spanCV(loe)$minimum
loeN <- stats::update(loe2, span = minspan)
yf <- predict(loeN,y)
xl[i,] <- as.numeric(xl[i,])/yf
}
loessDat = xl
#return(loessDat)
}
loessDat <- autoFit(xl = dat,y = numX)
}
###############
if(sum(is.na(loessDat) | as.matrix(loessDat) <= 0) > 0)
{
loessDat[loessDat<=0] <- NA
mvd2 <- impute::impute.knn(loessDat[,1:ncol(loessDat)])
loessDat <- mvd2$data
#message(date(),"The number of NA value in Data Profile after Loess Correction (KNN): ",
#sum(is.na(loessDat) | as.matrix(loessDat) <= 0))
}
#dirout.uni = paste(getwd(), "/statTarget/", sep = "")
#dir.create(dirout.uni)
#dirout.w = paste(getwd(), "/statTarget/shiftCor", sep="")
#dir.create(dirout.w)
#dirout.Bs = paste(getwd(), "/statTarget/shiftCor/Before_shiftCor", sep="")
#dir.create(dirout.Bs)
#dirout.As = paste(getwd(), "/statTarget/shiftCor/After_shiftCor", sep="")
#dir.create(dirout.As)
########### Out plot of each peak ############
#for(i in 1 :dim(dat)[1]){
# loplot(dat,loessDat,i)
#}
###############Raw output###########
raw_temp <- cbind(samPeno,inputedData)
nam_qc <- rownames(raw_temp)
QC_temp_raw <- grep("QC",nam_qc)
QC_temp_raw <- raw_temp[c(QC_temp_raw),]
raw_temp_qc <- QC_temp_raw[,-c(3,4)]
rownames(raw_temp_qc) <- NULL
#RSD30_CV=paste("shift_QC_raw",".csv", sep="")
#write.csv(raw_temp_qc, paste(dirout.Bs, RSD30_CV, sep="/"))
#cat(date(), "\nCalculation of CV distribution of raw peaks (QC)...\n")
#dirout.rs = paste(getwd(), "/statTarget/shiftCor/RSDresult/RSDdist_raw", sep="")
#dir.create(dirout.rs)
#setwd(dirout.rs)
#RsdCal(paste(dirout.Bs, RSD30_CV, sep="/"))
###############Loess output###########
lo_temp <- cbind(samPeno,t(loessDat))
nam_qc <- rownames(lo_temp)
QC_temp <- grep("QC",nam_qc)
QC_temp <- lo_temp[-c(QC_temp),]
lo_temp_sam <- QC_temp[,-c(2,4)]
rownames(lo_temp_sam) <- NULL
#RSD30_CV=paste("shift_sample_loess",".csv", sep="")
#write.csv(lo_temp_sam, paste(dirout.As, RSD30_CV, sep="/"))
return(lo_temp_sam)
QC_temp <- grep("QC",nam_qc)
QC_temp <- lo_temp[c(QC_temp),]
lo_temp_qc <- QC_temp[,-c(3,4)]
rownames(lo_temp_qc) <- NULL
#RSD30_CV=paste("shift_QC_loess",".csv", sep="")
#write.csv(lo_temp_qc, paste(dirout.As, RSD30_CV, sep="/"))
#cat(date(), "\nCalculation of CV distribution of corrected peaks (QC)...\n")
#dirout.rs = paste(getwd(), "/statTarget/shiftCor/RSDresult/RSDdist_cor", sep="")
#dir.create(dirout.rs)
#setwd(dirout.rs)
#RsdCal(paste(dirout.As, RSD30_CV, sep="/"))
#QC_temp <- grep("QC",nam_qc)
#QC_temp <- lo_temp[c(QC_temp),]
lo_temp_all <- lo_temp[,-c(2,4)]
rownames(lo_temp_all) <- NULL
#RSD30_CV=paste("shift_all_loess",".csv", sep="")
#write.csv(lo_temp_all, paste(dirout.As, RSD30_CV, sep="/"))
#write.table(lo_temp,"loess_out.csv",sep="/")
#write.table(lo_temp,"",sep="\t",quote=F)
#cat(date(), "\nCorrection Finished!\n")
#cat(date(),"\nAll done!\n")
##################Loess Plot########################
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.