R/MBImpute.r
In DanteR: Proteomics data analysis tool

# Model-based imputaion
# Ref:  "A statistical framework for protein quantitation in bottom-up MS-based
#        proteomics. Karpievitch Y, Stanley J, Taverner T, Huang J, Adkins JN,
#        Ansong C, Heffron F, Metz TO, Qian WJ, Yoon H, Smith RD, Dabney AR.
#        Bioinformatics 2009
#
# Written by Tom Taverner, Shelley Herbrich and Yuliya Karpievitch
# for Pacific Northwest National Lab.

MBimpute = function(m, treatment, protein_group=2, my.pi=0.05, compute_pi=TRUE){
# Imputes missing values 
# Expects the data to be filtered with model-based filtering prior to imputation
# Handles no sibling peptides with missing groups (think not this version)
# 
# Input:
#   input: An m x n matrix of intensities
#          DEPRICATED:(peptides x (n+2 samples)) matrix of expression data
#          where first column contains peptide identifier and second column
#          contains protein identifier
#   treatment:  vector indicating the treatment group of each sample ie [1 1 1 1 2 2 2 2...]
#   my.pi: PI value, estimate of the proportion of peptides missign completely at random,
#          as compared to censored at lower abundance levels
#  
# Output: list of:
#   y.impute: k x n matrix of 'complete' data (original and imputed values for missing data),
#             k <= m, as some peptides may have been filtered out due to low information content
#
# p-values computed after model-based filtering and imputation

prot.info <- get.ProtInfo(m)  # pepID, prID - not as in our text file data frame? here a mat
prot.info <- cbind(prot.info[,1], prot.info[,protein_group])
treatment <- get.factors(m)[,treatment] # yuliya: IS THIS RIGHT?


obj_metadata <- list(attr(m, "Row_Metadata"), attr(m, "Column_Metadata"))
obj_dimnames <- dimnames(m)

# calculate PI or used one passed in as parameter:
  if (!compute_pi){
  	my.pi <- eigen_pi(m, toplot=T)
  } # else should be taken from the user interface, can we verify it?

  # treatment factors
  n.treatment <- length(treatment)
  n.u.treatment <- length(unique(treatment))

  # Match to protein
  all.proteins <- unique(prot.info[,2])
  all.proteins <- all.proteins[order(all.proteins)]

  y_imputed <- NULL
  cat("Imputing...\n")

  for (k in 1:length(all.proteins)){
    prot <- all.proteins[k]
    pmid.matches <- prot.info[prot.info[,2]==prot,1]
    #idx.prot <- which(rownames(data) %in% pmid.matches)
    idx.prot <- which(rownames(m) %in% pmid.matches)
    # y_raw <- rbind(data[idx.prot,])
    y_raw <- m[idx.prot,,drop=F]
    y_info <- prot.info[idx.prot,,drop=F]
    # rownames(y_raw) <- rownames(data)[idx.prot]
    #rownames(y_raw) <- rownames(m)[idx.prot]
    if (nrow(y_raw) == 0) next
    
    #peptides and proteins of poor quality are removed prior to analysis 
    #estimate data parameters
    n.peptide <- nrow(y_raw)
    y <- as.vector(t(y_raw))
    n <- length(y)
    peptide <-rep(rep(1:n.peptide, each=n.treatment))

    #filter out min.missing
    n.present <- array(NA, c(n.peptide, n.u.treatment))
    colnames(n.present) <- unique(treatment)
    for(i in 1:n.peptide) for(j in 1:n.u.treatment) n.present[i,j] <- sum(!is.na(y [peptide==i & treatment==unique(treatment)[j]]))

    # remove peptides with completely missing group(s)
    present.min <- apply(n.present, 1, min)
    ii <- present.min > 0
    y_raw <- y_raw[ii,,drop=F] # reassign Y_raw to a submatrix of 1+ observations in each group
    if (nrow(y_raw) == 0) next

    # re-evaluate data parameters after filtering
    n.peptide <- nrow(y_raw)
    y <- as.vector(t(y_raw))
    n <- length(y)
    c.guess <- min(y, na.rm=T)
    peptide <-rep(rep(1:n.peptide, each=n.treatment))

    # re-calculate number of observed values per treatment group for each peptide
    # yuliya: may be able to use previous computation here for efficiency... but as long
    # as this works do not care to change at this time
    n.present <- array(NA, c(n.peptide, n.u.treatment))
    colnames(n.present) <- unique(treatment)
    for(i in 1:n.peptide){
      for(j in 1:n.u.treatment){
        n.present[i,j] <- sum(!is.na(y [peptide==i & treatment==unique(treatment)[j]]))
      }
    }

    # calculate pooled variance for each protein
    grp <- array(NA, c(1, n.u.treatment))
    for (j in 1:n.u.treatment){
      grp[j] <- sum(n.present[, j])
    }
    mpos <-which.max(grp)
    go <- protein_var(y_raw, treatment) # yuliya: function, see below
    overall_var <- go$overall_var

    num <-0
    den <- 0

    # calculate pooled variance for each peptide;
    # if only 1 onservation in a dx group assign the overall variance
    for(i in 1:n.peptide){
      y.i <- na.omit(y[peptide==i & treatment==unique(treatment)[mpos]])
      p2 <- var(y.i)
      if (is.na(p2)) p2 <- 0
      present <- length(y.i)
      num <- num+(p2*(present-1))
      den <- den + (present-1)
    }
    pep_var <- num/den
    if (pep_var==0) pep_var <- overall_var

    peptides.missing <- rowSums(is.na(y_raw))

    f.treatment <- factor(rep(treatment, n.peptide))
    f.peptide <- factor(peptide)

    # estimate rough model parameters
    # create model matrix for each protein and
    # remove any peptides with missing values
    ii <- (1:n)[is.na(y)]
    if (n.peptide != 1){
      X  <- model.matrix(~f.peptide + f.treatment, contrasts = list(f.treatment="contr.sum", f.peptide="contr.sum") )
    } else {
      X <- model.matrix(~f.treatment, contrasts=list(f.treatment="contr.sum"))
    }
    if(length(ii) > 0){
      y.c <- y[-ii]
      X.c <- X[-ii,]
    } else {
      y.c <- y
      X.c <- X
    }

    # calculate initial beta values and residuals
    beta <- drop(solve(t(X.c) %*% X.c) %*% t(X.c) %*% y.c)

    # compute initial delta's
    peptides.missing[peptides.missing==0] <- 0.9
    delta.y <- as.numeric(1/sqrt(pep_var*peptides.missing))
    dd <- delta.y[as.numeric(peptide)]


    # calculate cutoff values for each peptide
    c_hat = rep(NA, n.peptide)
    for(j in 1:n.peptide) {
      c_hat[j] = min(y_raw[j, ], na.rm = T)
    }
    c_h <- c_hat[as.numeric(peptide)]

    if(n.peptide==1){
      y.predict <- model.matrix(~f.treatment, contrasts=list(f.treatment="contr.sum"))%*% beta
    } else {
      y.predict <- model.matrix(~f.peptide + f.treatment,contrasts = list(f.treatment="contr.sum", f.peptide="contr.sum"))%*% beta
    }

    zeta <- dd*(c_h - y.predict)
    prob.cen <- pnorm(zeta, 0, 1)/(my.pi + (1-my.pi)*pnorm(zeta, 0, 1))
    choose.cen <- runif(n) < prob.cen
    set.cen <- is.na(y) &choose.cen
    set.mar <- is.na(y) &!choose.cen
    kappa <- my.pi + (1 - my.pi)*dnorm(zeta,0, 1)

    # compute information
    I_beta <- t(X) %*% diag(as.vector(dd^2*(1 - kappa*(1 + my.Psi.dash(zeta, my.pi))))) %*% X
    I_GRP <- I_beta[rev(rev(1:n.peptide+1)[1:(n.u.treatment - 1)]), rev(rev(1:n.peptide+1)[1:(n.u.treatment - 1)]), drop = F]
    if (!is.null(dim(I_GRP))) I_GRP = det(I_GRP)

    # Imputation: Replace missing values with random numbers drawn from the estimated likelihood model
    sigma <- 1/dd
    y.impute <- t(y_raw)
    if(sum(set.cen) > 0)
      y.impute[set.cen] <- rnorm.trunc(sum(set.cen), y.predict[set.cen], sigma[set.cen], hi=rep(c.guess, n)[set.cen])

    if(sum(set.mar) > 0)
      y.impute[set.mar] <- rnorm(n, y.predict, sigma)[set.mar]
      
    y.impute.return <- t(y.impute)

    #y.impute.ret <- cbind(y_info, y.impute.return)

    #row.names(y.impute.ret) <- row.names(y_raw)
    tryCatch(y_imputed <- rbind(y_imputed, y.impute.return), error = function(e){
      print(y.impute.return)
      print(dim(y_imputed))
      print(dim(y.impute.return))      
      stop("Error in rbind() for y.impute.net")      
    })
    
  } #end for each protein
  
  #y_imputed <- y_imputed[,-c(1,2), drop=FALSE]   # remove pr and pep ids from the datadrame, make a matrix
  #y_imputed_dimnames <- dimnames(y_imputed)

  #y_imputed <- array(as.numeric(y_imputed), dim(y_imputed))
  # y_imputed <- data.matrix(y_imputed) # as.matrix destroys the row/col info, so do that 1st

  #colnames(y_imputed) <-  obj_dimnames[[2]]
  #row.names(y_imputed) <-  y_imputed_dimnames[[1]]


  attr(y_imputed, "Row_Metadata") <- obj_metadata[[1]]
  attr(y_imputed, "Column_Metadata") <- obj_metadata[[2]]


  cat("Done imputing.\n")
  return(list(y_imputed=y_imputed,pi=as.matrix(my.pi)))
}
############# end imputation ###################
############## function pi #############

if(0){
eigen_pi <- function(m, toplot=T){ 
# Compute PI - proportion of observations missing at random
# INPUT: m - matrix of abundances, numsmaples x numpeptides
#       toplot - T/F plot mean vs protportion missing, curve and PI 
# OUTPUT: pi - 
# 
# Shelley Herbrich, June 2010, created for EigenMS and TamuQ 
#
# (1) compute 1) ave of the present values from each petide
#             2) number of missing and present values for each peptide
  
  # m = m[,-(1:2)] # yuliya: no 2 columns of ids, already on log scale
  # m = log(m)
  #remove completely missing rows
  m = m[rowSums(m, na.rm=T)!=0,]

  pepmean <- apply(m, 1, mean, na.rm=T)
  propmiss <- rowSums(is.na(m))/ncol(m)

  smooth_span <- (0.4)
  fit <- lowess(pepmean, propmiss, f=smooth_span)
  PI <- fit$y[fit$x==max(pepmean)]

  count <- 1
  while (PI<=0){
    smooth_span <- smooth_span-.1
    fit <- lowess(pepmean, propmiss, f=smooth_span)
    PI <- fit$y[fit$x==max(pepmean)]
    count <- count + 1
    if (count > 500) break
  }

  if (toplot){
  st <- paste("PI: ", PI) 
  plot(pepmean, propmiss, xlab="x", ylab="y", cex=0.5) #plot data point
  lines(fit)
  title("Lowess Regression", sub = st,
      cex.main = 2,   font.main= 3, col.main= "purple",
      cex.sub = 1, font.sub = 3, col.sub = "red")
  }
  return (pi=PI)
}

}
######################################################
protein_var <- function(Y_raw, treatment){
  n.peptide <- nrow(Y_raw)
  y <- as.vector(t(Y_raw))
  n <- length(y)
  n.treatment <- length(treatment)
  n.u.treatment <- length(unique(treatment))
  peptide <-rep(rep(1:n.peptide, each=n.treatment))

  n.present <- array(NA, c(n.peptide, n.u.treatment))
  colnames(n.present) <- unique(treatment)
  for(i in 1:n.peptide) for(j in 1:n.u.treatment) n.present[i,j] <- sum(!is.na(y [peptide==i & treatment==unique(treatment)[j]]))
  peptides.missing <- rowSums(is.na(Y_raw))

  f.treatment <- factor(rep(treatment, n.peptide))
  f.peptide <- factor(peptide)

  # estimate rough model parameters
  # create model matrix for each protein and
  # remove any peptides with missing values
  ii <- (1:n)[is.na(y)]
  if (n.peptide != 1){
    X  <- model.matrix(~f.peptide + f.treatment, contrasts = list(f.treatment="contr.sum", f.peptide="contr.sum") )
  } else {
    X <- model.matrix(~f.treatment, contrasts=list(f.treatment="contr.sum"))
  }
  if(length(ii) > 0){
    y.c <- y[-ii]
    X.c <- X[-ii,]
  } else {
    y.c <- y
    X.c <- X
  }

  # calculate initial beta values and residuals
  beta <- drop(solve(t(X.c) %*% X.c) %*% t(X.c) %*% y.c)
  Y_hat <- X.c %*% beta
  Y_temp <- Y_raw
  Y_temp <- as.numeric(t(Y_temp))
  Y_temp[!is.na(Y_temp)] <- Y_hat
  Y_temp <- matrix(Y_temp, nrow = n.peptide, byrow = T)
  Y_hat <- Y_temp
  ee <- Y_raw - Y_hat

  effects <- X.c %*% beta
  resid <- y.c - effects
  overall_var <- var(resid)
  return(list(overall_var=det(overall_var)))
}
################################################
my.Psi = function(x, my.pi){
# calculates Psi
exp(log(1-my.pi)  + dnorm(x, 0, 1, log=T) - log(my.pi + (1 - my.pi) * pnorm(x, 0, 1) ))
}
# end my.Psi

my.Psi.dash = function(x, my.pi){
# calculates the derivative of Psi
-my.Psi(x, my.pi) * (x + my.Psi(x, my.pi))
}
# end my.Psi.dash

phi = function(x){dnorm(x)}

rnorm.trunc <- function (n, mu, sigma, lo=-Inf, hi=Inf){
# Calculates truncated noraml
  p.lo <- pnorm (lo, mu, sigma)
  p.hi <- pnorm (hi, mu, sigma)
  u <- runif (n, p.lo, p.hi)
  return (qnorm (u, mu, sigma))
}
# End rnorm.trunc



################################################
# dialog for DanteR
################################################
MBimpute.dialog = list( title='Model-based Imputation',
  m.dataframeItem=NULL, label='Data to impute',
  signal = list("default", "get.dataset.factors", "treatment", user.data=list(include.primary=FALSE)),
  signal = list("default", "get.dataset.row.metadata.fields", "protein_group", user.data=list(include.primary=FALSE)),    
  treatment.choiceItem=NULL, label='Factors',
  protein_group.choiceItem = NULL, label = "Field Containing Proteins",   
  compute_pi.trueFalseItem=FALSE, label='Manually Estimate PI',
   tooltip = "If this is checked, the user can set the estimated random missingness probability",
  signal = c("default", "toggle.sensitive", "my.pi"),
    my.pi.numericItem="0.05", label='PI', indent=10
)




get_pi = function (choice){
		if (choice=="Yes"){
		my.pi = my.pi
	}else {
		my.pi = run.dialog(enter_pi)$retval
	}
  return(my.pi)
}

enter_pi = function(new_val){
	my.pi = new_val
	return(my.pi)
}
enter_pi.dialog=list(title='New PI Value', new_val.numericItem=NULL, label='Enter new PI value:')