R/MAR.est.R In GMSimpute: Generalized Mass Spectrum Missing Peaks Abundance Imputation

#' Missing At Random (MAR) proportion estimation based on technical replicates.
#'
#' MAR.est estimates the proportion of missing peaks at random (MAR) caused by preprocessing tools with exactly two technical replicates per sample.
#' @param abundance The full abundance matrix without missing value, with features in rows and samples in columns.
#' @param sample A vector of characters or integers. It is the sample name for each pair of replicates.
#' @param log.scale A scalar or vector of proportions. It is the total percentage of missing peaks throughout the full matrix.
#' @param violin.plot Logical, whether to generate violin and box plots to visualize abundance distribution of missing and nonmissing peaks.
#'
#' @import utils
#' @import ggplot2
#' @import reshape2
#'
#'
#' @return
#'    \item{MAR.Proportion}{Estimated MAR proportion}
#'    \item{plot}{Violin and box plots generated by ggplot2}
#' @examples
#' data('replicates')
#' # replicates contains mass specturm log abundance of 85 peptides
#' # with missing values for 4 pairs of technical replicates.
#'
#'
#' MAR=MAR.est(replicates,sample=rep(1:4,each=2),log.scale=FALSE,violin.plot=TRUE)
#' # Estimates the MAR proportion in the 4 pairs of replicates and output violin/box plots object.
#'
#' print(MAR$plot) #' # Print violin/box plots #' #' @export #' @importFrom stats na.omit predict MAR.est<-function(abundance,sample,log.scale=TRUE,violin.plot=FALSE){ if(!requireNamespace('ggplot2')) { install.packages('ggplot2') if(!requireNamespace('ggplot2',character.only = TRUE)) stop("Package 'ggplot2' not found")} if(!requireNamespace('reshape2')) { install.packages('reshape2') if(!requireNamespace('reshape2',character.only = TRUE)) stop("Package 'reshape2' not found")} if(length(sample)!=ncol(abundance)){ stop('Abundance matrix colnames must match with sample index') } sample=as.character(sample) if(sum(table(sample)!=2)>0){ stop('Use two replicates per sample') } replicate.name=paste(sort(sample),rep(c('a','b'),length(sample)/2),sep = '.') abundance=abundance[,order(sample)] colnames(abundance)=replicate.name abundance[abundance==0]=NA if(log.scale){ edata=log2(abundance) }else {edata=abundance} n_edata=nrow(edata) n_sample=ncol(edata) result_edata=matrix(NA,ncol=n_sample,nrow=n_edata) colnames(result_edata)=colnames(edata) for (i in 1:(n_sample/2)){ keep=which(!is.na(edata[,(2*i-1)])) #index for non-NA obs result_edata[,(2*i)]=replace(edata[,(2*i)],keep,NA) #keep associated value for each missing replicate keep1=which(!is.na(edata[,(2*i)])) result_edata[,(2*i-1)]=replace(edata[,(2*i-1)],keep1,NA) } result_edata=as.data.frame(result_edata) test_edata=matrix(NA,nrow=n_edata,ncol=n_sample/2) colnames(test_edata)=colnames(result_edata)[seq(1,n_sample,2)] for (i in 1:(n_sample/2)){ y1=apply(result_edata[,(2*i-1):(2*i)],1,function(x)sum(x,na.rm = TRUE)) y1[y1==0]=NA n_row=length(na.omit(y1)) test_edata[1:n_row,i]=as.vector(na.omit(y1)) } test_edata=as.data.frame(test_edata) nomiss_edata=matrix(NA,ncol=n_sample,nrow=n_edata) colnames(nomiss_edata)=colnames(edata) for (i in 1:(n_sample/2)){ keep=which(is.na(edata[,(2*i-1)])) #index for non-NA obs nomiss_edata[,(2*i)]=replace(edata[,(2*i)],keep,NA) #keep associated value for each missing replicate keep1=which(is.na(edata[,(2*i)])) nomiss_edata[,(2*i-1)]=replace(edata[,(2*i-1)],keep1,NA) } nomiss_edata=as.data.frame(nomiss_edata) colnames(test_edata)=paste('Missing',colnames(test_edata),sep = '.') colnames(nomiss_edata)=paste('Full',colnames(nomiss_edata),sep = '.') random.pct=colSums(!is.na(test_edata))/dim(edata)[1] names(random.pct)=paste('S',names(table(sample)),sep = '') if(violin.plot){ all.data=cbind(nomiss_edata,test_edata) plot.data=melt(all.data) plot.data=plot.data[!is.na(plot.data$value),]
plot.data$variable=droplevels(plot.data$variable)
plot.data$Compound=ifelse(grepl('Missing',plot.data$variable),'Random Missing','Full')
plot.data$sample=gsub('Full','S',plot.data$variable)
plot.data$sample=gsub('Missing','S',plot.data$sample)
plot.data$sample=gsub('.a','',plot.data$sample)
plot.data$sample=gsub('.b','',plot.data$sample)

dodge <- position_dodge(width = 0.8)
p=ggplot(plot.data,aes(x=sample, y=plot.data$value,color=plot.data$Compound))+geom_violin(position = dodge)+geom_boxplot(width=0.1,position = dodge)+theme(legend.position="bottom")
p=p+labs(x='Replicates Pair',y='Log Abundance')

}
if(violin.plot){
return(list(MAR.Proportion=random.pct,plot=p))
} else{return(random.pct)}

}

Try the GMSimpute package in your browser

Any scripts or data that you put into this service are public.

GMSimpute documentation built on May 1, 2019, 10:13 p.m.