R/designSampleSizeTMT.R

Defines functions .getNumSample .calculatePower .getMedianSigmaSubject .getMedianSigmaRun .getVarComponentTMT designSampleSizeTMT

Documented in .calculatePower designSampleSizeTMT .getMedianSigmaRun .getMedianSigmaSubject .getNumSample .getVarComponentTMT

#' Planning future experimental designs of Tandem Mass Tag (TMT) experiments acquired with Data-Dependent Acquisition (DDA or shotgun)
#'
#' @description Calculate sample size for future experiments of a TMT experiment 
#' based on intensity-based linear model. Two options of the calculation: 
#' (1) number of biological replicates per condition, 
#' (2) power.
#' 
#' @param data 'FittedModel' in testing output from function groupComparisonTMT.
#' @param desiredFC the range of a desired fold change which includes the lower 
#' and upper values of the desired fold change.
#' @param FDR a pre-specified false discovery ratio (FDR) to control the overall 
#' false positive rate. Default is 0.05
#' @param numSample minimal number of biological replicates per condition. 
#' TRUE represents you require to calculate the sample size for this category, 
#' else you should input the exact number of biological replicates.
#' @param power a pre-specified statistical power which defined as the probability 
#' of detecting a true fold change. TRUE represent you require to calculate the power 
#' for this category, else you should input the average of power you expect. Default is 0.9
#' @inheritParams .documentFunction
#' 
#' @details The function fits the model and uses variance components to calculate 
#' sample size. The underlying model fitting with intensity-based linear model with 
#' technical MS run replication. Estimated sample size is rounded to 0 decimal.
#' The function can only obtain either one of the categories of the sample size 
#' calculation (numSample, numPep, numTran, power) at the same time.
#' 
#' @return data.frame - sample size calculation results including varibles:
#' desiredFC, numSample, FDR,  and power.
#' 
#' @importFrom stats median
#' @importFrom plotly ggplotly style add_trace plot_ly subplot layout
#' 
#' @export
#' 
#' @examples
#' data(input.pd)
#' # use protein.summarization() to get protein abundance data
#' quant.pd.msstats = proteinSummarization(input.pd,
#'                                         method="msstats",
#'                                         global_norm=TRUE,
#'                                         reference_norm=TRUE)
#' 
#' test.pairwise = groupComparisonTMT(quant.pd.msstats, save_fitted_models = TRUE)
#' head(test.pairwise$ComparisonResult)
#' 
#' ## Calculate sample size for future experiments:
#' #(1) Minimal number of biological replicates per condition
#' designSampleSizeTMT(data=test.pairwise$FittedModel, numSample=TRUE,
#'                  desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
#' #(2) Power calculation
#' designSampleSizeTMT(data=test.pairwise$FittedModel, numSample=2,
#'                  desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
#'           
designSampleSizeTMT = function(
    data, desiredFC, FDR = 0.05, numSample = TRUE, power = 0.9,
    use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL
) {
  
  var_component = .getVarComponentTMT(data)

  median_sigma_error = median(var_component[["Error"]], na.rm = TRUE)
  median_sigma_subject = .getMedianSigmaSubject(var_component)
  median_sigma_run = .getMedianSigmaRun(var_component)
  
  ## power calculation
  if (isTRUE(power)) {
    delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
    desiredFC = 2 ^ delta
    power_output = .calculatePower(desiredFC, FDR, delta, median_sigma_error, 
                                   median_sigma_subject, median_sigma_run, 
                                   numSample)        
    var_comp = (median_sigma_error / numSample + median_sigma_subject / numSample + median_sigma_run / numSample)
    CV = round( (2 * var_comp) / desiredFC, 3)
    sample_size = data.frame(desiredFC, numSample, FDR, 
                             power = power_output, CV)
  }	
  
  if (is.numeric(power)) {
    # delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
    delta = log2(seq(desiredFC[1], desiredFC[2], 0.001))
    desiredFC = 2 ^ delta
    ## Large portion of proteins are not changing
    m0_m1 = 99 ## it means m0/m1=99, m0/(m0+m1)=0.99
    alpha = power * FDR / (1 + (1 - FDR) * m0_m1)
    if (isTRUE(numSample)) {
      numSample = .getNumSample(desiredFC, power, alpha, delta,
                                median_sigma_error, median_sigma_subject,
                                median_sigma_run)
      var_comp = (median_sigma_error / numSample + median_sigma_subject / numSample + median_sigma_run / numSample)
      CV = round(2 * var_comp / desiredFC, 3)
      sample_size = data.frame(desiredFC, numSample, FDR, power, CV)
    }
  } 
  sample_size
}

#' Get variances from models fitted by the groupComparison function
#' @param fitted_models FittedModels element of groupComparison output
#' @keywords internal
.getVarComponentTMT = function(fitted_models) {
  Protein = NULL
  
  result = data.table::rbindlist(
    lapply(fitted_models, function(fit) {
      if (!is.null(fit)) {
        if (!is(fit, "lmerMod")) {
          error = summary(fit)$sigma^2
          subject <- NA
          group_subject <- NA
          run <- NA
          mix_run <- NA
        } else {
          stddev = c(sapply(lme4::VarCorr(fit), function(el) attr(el, "stddev")), 
                     attr(lme4::VarCorr(fit), "sc"))
          error = stddev[names(stddev) == ""]^2
          if (any(names(stddev) %in% "SUBJECT.(Intercept)")) {
            subject = stddev["SUBJECT.(Intercept)"]^2
          } else {
            subject = NA
          }
          if (any(names(stddev) %in% "SUBJECT:GROUP.(Intercept)")) {
            group_subject = stddev["SUBJECT:GROUP.(Intercept)"]^2
          } else {
            group_subject = NA
          }
          if (any(names(stddev) %in% "Run.(Intercept)")) {
            run = stddev["Run.(Intercept)"]^2
          } else {
            run = NA
          }
          if (any(names(stddev) %in% "Mixture:TechRepMixture.(Intercept)")) {
            mix_run = stddev["Mixture:TechRepMixture.(Intercept)"]^2
          } else {
            mix_run = NA
          }
          
        }
        list(Error = error,
             Subject = subject,
             GroupBySubject = group_subject,
             Run = run,
             RunByMixture=mix_run)
      } else {
        NULL
      }
    })
  )
  result[, Protein := 1:.N]
  result
}

#' Get median per subject or group by subject
#' @param var_component data.frame, output of .getVarComponent
#' @importFrom stats median
#' @keywords internal
.getMedianSigmaRun = function(var_component) {
  if (sum(!is.na(var_component[, "RunByMixture"])) > 0) {
    median_sigma_run = median(var_component[["RunByMixture"]], na.rm=TRUE)
  } else {
    if (sum(!is.na(var_component[, "Run"])) > 0) {
      median_sigma_run = median(var_component[["Run"]], na.rm=TRUE)
    } else {
      median_sigma_run = 0
    }
  }
  median_sigma_run
}

#' Get median per run or run by mix
#' @param var_component data.frame, output of .getVarComponent
#' @importFrom stats median
#' @keywords internal
.getMedianSigmaSubject = function(var_component) {
  if (sum(!is.na(var_component[, "GroupBySubject"])) > 0) {
    median_sigma_subject = median(var_component[["GroupBySubject"]], na.rm=TRUE)
  } else {
    if (sum(!is.na(var_component[, "Subject"])) > 0) {
      median_sigma_subject = median(var_component[["Subject"]], na.rm=TRUE)
    } else {
      median_sigma_subject = 0
    }
  }
  median_sigma_subject
}

#' Power calculation
#' @inheritParams designSampleSizeTMT
#' @param delta difference between means (?)
#' @param median_sigma_error median of error standard deviation
#' @param median_sigma_subject median standard deviation per subject
#' @importFrom stats qnorm
#' @keywords internal
.calculatePower = function(desiredFC, FDR, delta, median_sigma_error, 
                           median_sigma_subject, median_sigma_run, numSample) {
  m0_m1 = 99
  t = delta / sqrt(2 * (median_sigma_error / numSample + median_sigma_subject / numSample + median_sigma_run / numSample))
  powerTemp = seq(0, 1, 0.01)
  power = numeric(length(t))
  for (i in seq_along(t)) {
    diff = qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t[i]
    min(abs(diff), na.rm = TRUE)
    power[i] = powerTemp[order(abs(diff))][1]
  }
  power
}

#' Get sample size
#' @inheritParams designSampleSizeTMT
#' @inheritParams .calculatePower
#' @param alpha significance level
#' @param delta difference between means (?)
#' @importFrom stats qnorm
#' @keywords internal
.getNumSample = function(desiredFC, power, alpha, delta, median_sigma_error, 
                         median_sigma_subject, median_sigma_run){
  z_alpha = qnorm(1 - alpha / 2)
  z_beta = qnorm(power)
  aa = (delta / (z_alpha + z_beta)) ^ 2
  numSample = round(2 * (median_sigma_error + median_sigma_subject + median_sigma_run) / aa, 0)
  numSample
}
Vitek-Lab/MSstatsTMT documentation built on May 25, 2024, 4:12 p.m.