R/batch_analysis_fnxs.R
In Stickbreaker: Fits Stickbreaking, Multiplicative and Additive Models to Data

Documented in analyze.mult.add.data.batch analyze.stick.data.batch

#' Analyze batch data generated by \code{\link{sim.fit.stick.data.batch}}
#'
#' @param mut.vals Vector of number of mutations to simulate
#' @param coe.vals Vector of stickbreaking coefficients to analyze
#' @param sig.vals Vector of sigma values to analyze
#' @param fit.methods Vector of all methods of estimating d to then fit model and output results.
#' Accepts "MLE", "RDB", "max", "seq", "RDB.all" and "All". "All" does all methods. Default is "seq". Case sensitive.
#' Note you should use same fit.methods (or a subset of) as used when generating data in
#' @param d.range Search range for d.
#' @param d.true The distance to the boundary (d) used in simulations
#' \code{\link{sim.fit.stick.data.batch}}.
#' @param inpath The path to read input files from. Generally same as \code{outpath} from
#' \code{\link{sim.fit.stick.data.batch}}.
#' @param relative \code{TRUE/FALSE}. Should bias and rMSE be divided by parameter value to make them relative values?
#' @param coes Vector of stickbreaking coefficients for each mutation
#' @return List: \cr
#' [[1]] \code{error.table} A data frame with the parametric conditions and the failure rate, bias and rMSE
#' for the parameters d, the stickbreaking coefficients, and sigma. \cr
#' [[2]] \code{seq.method.props} A data frame with the proportion of sequential estimates coming from the MLE
#' RDB and max methods for each parametric condition. If "seq" is not among the fit.methods, the dataframe is
#' returned but empty.
#' @details This function assesses bias and relative root mean squared error (rMSE) from the data generated by
#' \code{\link{sim.fit.stick.data.batch}}. It also determines the failure rate for each method.
#' An estimate is considered invalid (a failure) when it does not fall in the range \code{d.range}. The
#' function also returns the data frame \code{seq.method.props} that indicates for the sequential method,
#' what proportion of the estimates come from the MLE, RDB and max methods. If "seq" is not part of
#' fit.methods (and if fit.methods <- "All" then it is), this dataframe is empty but still returned.
#' @importFrom stats as.formula dnorm lm median optimize pf predict rnorm runif sigma
#' @importFrom utils read.table write.table
#' @export

analyze.stick.data.batch <- function(mut.vals, coe.vals, sig.vals, inpath, fit.methods, d.range, d.true, relative, coes){

  if (fit.methods[1] == "All"){
    fit.methods2 <- c("MLE", "RDB", "max", "seq", "RDB.all")
  } else{
    fit.methods2 <- fit.methods
  }

  error.columns <- c("n.muts", "coes", "sigma", "d.true", "method", "n.reps", "fail.rate", "d.bias", "d.rMSE", "coe.bias", "coe.rMSE", "sig.bias", "sig.rMSE")
  parm.sets <- expand.grid(n.muts=mut.vals, coe=coe.vals, sig=sig.vals, d.true=d.true, method=fit.methods2)
  parm.sets.string <- apply(parm.sets, 1, function(x) paste(x, collapse="_"))
  error.table <- as.data.frame(matrix(nrow=length(parm.sets[,1]), ncol=length(error.columns)))
  colnames(error.table) <- error.columns
  error.table[,1:length(parm.sets[1,])] <- parm.sets

  er.cols2 <- c("n.muts", "coes", "sigma", "d.true", "p.MLE", "p.RDB", "p.max", "p.seq", "p.RDB.all")
  parm.sets2 <- expand.grid(n.muts=mut.vals, coe=coe.vals, sig=sig.vals, d.true=d.true)
  parm.sets.string2 <- apply(parm.sets2, 1, function(x) paste(x, collapse="_"))
  seq.method.props <- as.data.frame(matrix(nrow=length(parm.sets2[,1]), ncol=length(er.cols2)))
  colnames(seq.method.props) <- er.cols2
  seq.method.props[,1:length(parm.sets2[1,])] <- parm.sets2


  for (method.i in 1:length(fit.methods2)){   # loop over each method
    for (mut.i in 1:length(mut.vals)){         # loop over each number of mutations
      infile <- paste(inpath, "_", fit.methods2[method.i], "_", mut.vals[mut.i], "muts.txt", sep="")
      data <- read.table(file=infile, header=TRUE)
      for (coe.i in 1:length(coe.vals)){
        for (sig.i in 1:length(sig.vals)){
          sub.data <- subset(data, (coes==coe.vals[coe.i] & sigma==sig.vals[sig.i]))
          sub.data.raw <- sub.data
          parm.set.string <- paste(c(sub.data[1, 2:5], fit.methods2[method.i]), collapse="_")
          error.row <- which(parm.sets.string == parm.set.string)

          error.table$n.reps[error.row] <- length(sub.data$d.hat)
          error.table$fail.rate[error.row] <- (length(which(sub.data$d.hat >= 0.99*d.range[2] | sub.data$d.hat <= 1.1*d.range[1])))/length(sub.data$d.hat)
          nf.rows <- which(!(sub.data$d.hat > 0.99*d.range[2] | sub.data$d.hat < 1.1*d.range[1]))
          sub.data <- sub.data[nf.rows,]

          if (length(nf.rows)>0){
            denom <- 1
            if (relative==TRUE){
              denom <- d.true
            }
            error.table$d.bias[error.row] <- mean((sub.data$d.hat - sub.data$d.true))/denom
            error.table$d.rMSE[error.row] <-mean((abs(sub.data$d.hat - sub.data$d.true)))/denom

            coe.col1 <- which(colnames(sub.data)=="u.hats.1")
            coe.cols <- seq(coe.col1,(coe.col1+mut.vals[mut.i]-1))
            coe.errors <- apply(sub.data[,coe.cols], 2, function(x) (x-sub.data$coe))
            denom <- 1
            if (relative==TRUE){denom <- coe.vals[coe.i]}
            error.table$coe.bias[error.row] <- mean(coe.errors)/denom
            error.table$coe.rMSE[error.row] <- mean(abs(coe.errors))/denom
            denom <- 1
            if (relative==TRUE){denom <- sig.vals[sig.i]}
            error.table$sig.bias[error.row] <- mean((sub.data$sig.hat - sub.data$sigma))/denom
            error.table$sig.rMSE[error.row] <- mean(abs(sub.data$sig.hat - sub.data$sigma))/denom
          }

          if (fit.methods2[method.i]=="seq"){
            parm.string2 <- paste(sub.data.raw[1,2:5], collapse="_")
            seq.methods.row <- which(parm.sets.string2==parm.string2)
            nreps <- length(sub.data.raw$method)
            seq.method.props$p.MLE[seq.methods.row] <- length(which(sub.data.raw$method=="MLE"))/nreps
            seq.method.props$p.RDB[seq.methods.row] <- length(which(sub.data.raw$method=="RDB"))/nreps
            seq.method.props$p.max[seq.methods.row] <- length(which(sub.data.raw$method=="max"))/nreps
            seq.method.props$p.seq[seq.methods.row] <- length(which(sub.data.raw$method=="seq"))/nreps
            seq.method.props$p.RDB.all[seq.methods.row] <- length(which(sub.data.raw$method=="RDB.all"))/nreps
          }
        }
      }
    }
  }
  return(list(error.table=error.table, seq.method.props=seq.method.props))
}




#' Analyze batch data generated by \code{\link{sim.fit.mult.add.data.batch}}
#'
#' @param mut.vals Vector of number of mutations to simulate
#' @param coe.vals Vector of stickbreaking coefficients to analyze
#' @param sig.vals Vector of sigma values to analyze
#' @param inpath The path to read input files from. Generally same as \code{outpath} from
#' \code{\link{sim.fit.stick.data.batch}}.
#' @param relative \code{TRUE/FALSE}. Should bias and rMSE be divided by parameter value to make them relative values?
#' @param coes Vector of stickbreaking coefficients for each mutation
#' @return \code{error.table} A data frame with the parametric conditions and the failure rate, bias and rMSE
#' for the parameters d, the stickbreaking coefficients, and sigma. \cr
#' @details This function assesses bias and relative root mean squared error (rMSE) from the data generated by
#' \code{\link{sim.fit.mult.add.data.batch}}.
#' @export

analyze.mult.add.data.batch <- function(mut.vals, coe.vals, sig.vals, inpath, relative, coes){

  error.columns <- c("n.muts", "coes", "sigma", "n.reps", "coe.bias", "coe.rMSE", "sig.bias", "sig.rMSE")
  parm.sets <- expand.grid(n.muts=mut.vals, coe=coe.vals, sig=sig.vals)
  parm.sets.string <- apply(parm.sets, 1, function(x) paste(x, collapse="_"))
  error.table <- as.data.frame(matrix(nrow=length(parm.sets[,1]), ncol=length(error.columns)))
  colnames(error.table) <- error.columns
  error.table[,1:length(parm.sets[1,])] <- parm.sets

  for (mut.i in 1:length(mut.vals)){         # loop over each number of mutations
    infile <- paste(inpath, "_", mut.vals[mut.i], "muts.txt", sep="")
    data <- read.table(file=infile, header=TRUE)
    for (coe.i in 1:length(coe.vals)){
      for (sig.i in 1:length(sig.vals)){
        sub.data <- subset(data, (coes==coe.vals[coe.i] & sigma==sig.vals[sig.i]))
        parm.set.string <- paste(sub.data[1, 3:5], collapse="_")
        error.row <- which(parm.sets.string == parm.set.string)
        error.table$n.reps[error.row] <- length(sub.data$n.muts)


        coe.col1 <- which(colnames(sub.data) %in% c("s.hats.1", "w.hats.1"))
        coe.cols <- seq(coe.col1,(coe.col1+mut.vals[mut.i]-1))
        coe.errors <- apply(sub.data[,coe.cols], 2, function(x) (x-sub.data$coe))
        denom <- 1
        if (relative==TRUE){denom <- coe.vals[coe.i]}
        error.table$coe.bias[error.row] <- mean(coe.errors)/denom
        error.table$coe.rMSE[error.row] <- mean(abs(coe.errors))/denom
        denom <- 1
        if (relative==TRUE){denom <- sig.vals[sig.i]}
        error.table$sig.bias[error.row] <- mean((sub.data$sig.hat - sub.data$sigma))/denom
        error.table$sig.rMSE[error.row] <- mean(abs(sub.data$sig.hat - sub.data$sigma))/denom
      }  # next sig.i
    }  #next coe.i
  }  #next mut.i
  return(error.table)
}