R/MAR.est.R

#' Missing At Random (MAR) proportion estimation based on technical replicates.
#'
#' MAR.est estimates the proportion of missing peaks at random (MAR) caused by preprocessing tools with exactly two technical replicates per sample.
#' @param abundance The full abundance matrix without missing value, with features in rows and samples in columns.
#' @param sample A vector of characters or integers. It is the sample name for each pair of replicates.
#' @param log.scale A scalar or vector of proportions. It is the total percentage of missing peaks throughout the full matrix.
#' @param violin.plot Logical, whether to generate violin and box plots to visualize abundance distribution of missing and nonmissing peaks.
#'
#' @import utils
#' @import ggplot2
#' @import reshape2
#'
#'
#' @return
#'    \item{MAR.Proportion}{Estimated MAR proportion}
#'    \item{plot}{Violin and box plots generated by ggplot2}
#' @examples
#' data('replicates')
#' # replicates contains mass specturm log abundance of 85 peptides
#' # with missing values for 4 pairs of technical replicates.
#'
#'
#' MAR=MAR.est(replicates,sample=rep(1:4,each=2),log.scale=FALSE,violin.plot=TRUE)
#' # Estimates the MAR proportion in the 4 pairs of replicates and output violin/box plots object.
#'
#' print(MAR$plot)
#' # Print violin/box plots
#'
#' @export
#' @importFrom stats na.omit predict


MAR.est<-function(abundance,sample,log.scale=TRUE,violin.plot=FALSE){

  if(!requireNamespace('ggplot2')) {
    install.packages('ggplot2')
    if(!requireNamespace('ggplot2',character.only = TRUE)) stop("Package 'ggplot2' not found")}
  if(!requireNamespace('reshape2')) {
    install.packages('reshape2')
    if(!requireNamespace('reshape2',character.only = TRUE)) stop("Package 'reshape2' not found")}

  if(length(sample)!=ncol(abundance)){
    stop('Abundance matrix colnames must match with sample index')
  }
  sample=as.character(sample)
  if(sum(table(sample)!=2)>0){
    stop('Use two replicates per sample')
  }

  replicate.name=paste(sort(sample),rep(c('a','b'),length(sample)/2),sep = '.')
  abundance=abundance[,order(sample)]
  colnames(abundance)=replicate.name
  abundance[abundance==0]=NA

  if(log.scale){
  edata=log2(abundance)
  }else {edata=abundance}

  n_edata=nrow(edata)
  n_sample=ncol(edata)

  result_edata=matrix(NA,ncol=n_sample,nrow=n_edata)
  colnames(result_edata)=colnames(edata)
  for (i in 1:(n_sample/2)){
    keep=which(!is.na(edata[,(2*i-1)])) #index for non-NA obs
    result_edata[,(2*i)]=replace(edata[,(2*i)],keep,NA) #keep associated value for each missing replicate
    keep1=which(!is.na(edata[,(2*i)]))
    result_edata[,(2*i-1)]=replace(edata[,(2*i-1)],keep1,NA)
  }
  result_edata=as.data.frame(result_edata)

  test_edata=matrix(NA,nrow=n_edata,ncol=n_sample/2)
  colnames(test_edata)=colnames(result_edata)[seq(1,n_sample,2)]
  for (i in 1:(n_sample/2)){
    y1=apply(result_edata[,(2*i-1):(2*i)],1,function(x)sum(x,na.rm = TRUE))
    y1[y1==0]=NA
    n_row=length(na.omit(y1))
    test_edata[1:n_row,i]=as.vector(na.omit(y1))
  }
  test_edata=as.data.frame(test_edata)

  nomiss_edata=matrix(NA,ncol=n_sample,nrow=n_edata)
  colnames(nomiss_edata)=colnames(edata)
  for (i in 1:(n_sample/2)){
    keep=which(is.na(edata[,(2*i-1)])) #index for non-NA obs
    nomiss_edata[,(2*i)]=replace(edata[,(2*i)],keep,NA) #keep associated value for each missing replicate
    keep1=which(is.na(edata[,(2*i)]))
    nomiss_edata[,(2*i-1)]=replace(edata[,(2*i-1)],keep1,NA)
  }
  nomiss_edata=as.data.frame(nomiss_edata)

  colnames(test_edata)=paste('Missing',colnames(test_edata),sep = '.')
  colnames(nomiss_edata)=paste('Full',colnames(nomiss_edata),sep = '.')

  random.pct=colSums(!is.na(test_edata))/dim(edata)[1]
  names(random.pct)=paste('S',names(table(sample)),sep = '')

  if(violin.plot){

    all.data=cbind(nomiss_edata,test_edata)
    plot.data=melt(all.data)
    plot.data=plot.data[!is.na(plot.data$value),]
    plot.data$variable=droplevels(plot.data$variable)
    plot.data$Compound=ifelse(grepl('Missing',plot.data$variable),'Random Missing','Full')
    plot.data$sample=gsub('Full','S',plot.data$variable)
    plot.data$sample=gsub('Missing','S',plot.data$sample)
    plot.data$sample=gsub('.a','',plot.data$sample)
    plot.data$sample=gsub('.b','',plot.data$sample)


    dodge <- position_dodge(width = 0.8)
    p=ggplot(plot.data,aes(x=sample, y=plot.data$value,color=plot.data$Compound))+geom_violin(position = dodge)+geom_boxplot(width=0.1,position = dodge)+theme(legend.position="bottom")
    p=p+labs(x='Replicates Pair',y='Log Abundance')


  }
  if(violin.plot){
  return(list(MAR.Proportion=random.pct,plot=p))
  } else{return(random.pct)}


}

Try the GMSimpute package in your browser

Any scripts or data that you put into this service are public.

GMSimpute documentation built on May 1, 2019, 10:13 p.m.