R/SampleSizeCalculation.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Documented in designSampleSize designSampleSizePlots

#############################################
## designSampleSize
#############################################

#' @export
designSampleSize <- function(data=data,
                             desiredFC=desiredFC,
                             FDR=0.05,
                             numSample=TRUE,
                             power=0.9) {
  
  	labeled <- FALSE
  	scopeOfBioReplication <- "expanded"
  	interference <- TRUE
  	equalFeatureVar <- TRUE
  
  	## save process output in each step
  	allfiles <- list.files()
  	filenaming <- "msstats"
  
  	if (length(grep(filenaming, allfiles)) == 0) {
    
    	finalfile <- "msstats.log"
    	processout <- NULL
    
  	} else {
    
    	num <- 0
    	finalfile <- "msstats.log"
    
    	while(is.element(finalfile, allfiles)) {
      		num <- num + 1
      		lastfilename <- finalfile ## in order to rea
      		finalfile <- paste(paste(filenaming, num, sep="-"), ".log", sep="")
    	}
    
    	finalfile <- lastfilename
    	processout <- as.matrix(read.table(finalfile, header=TRUE, sep="\t"))
  	}
  
  	processout <- rbind(processout, as.matrix(c(" ", " ", "MSstats - designSampleSize function", " "), ncol=1))
  
  	processout <- rbind(processout, c(paste0("Desired fold change = ", paste(desiredFC, collapse=" - "))))
  	processout <- rbind(processout, c(paste0("FDR = ", FDR)))
  	processout <- rbind(processout, c(paste0("Power = ", power)))
  
  	write.table(processout, file=finalfile, row.names=FALSE)
  
  	## for label-free experiment
  	#if (!labeled) {
    
    sigma.error <- NULL	
    VarComponent <- data.frame("Protein"=seq(1, length(data)), 
                               "Error"=NA, 
                               "Subject"=NA, 
                               "GroupBySubject"=NA)
    
    for (i in 1:length(data)) {
      
      	# note: when run is fixed, we can obtain the same variance of error for both case-control and time course studies.
      
      	fit.full <- data[[i]]
      
      	## if fit.full==NA (class(fit.full)=="try-error)
      	if (is.null(fit.full)) {
        	## !!!!! but if we have NULL for last protein?
        	next
        
      	} else {
        
        	## get variance component
        
        	if (class(fit.full) != "lmerMod") {
          		VarComponent[i, "Error"] <- summary(fit.full)$sigma^2
        	} else {
          		stddev  <-  c(sapply(VarCorr(fit.full), function(el) attr(el, "stddev")), attr(VarCorr(fit.full), "sc"))
         	 	VarComponent[i, "Error"] <- stddev[names(stddev) == ""]^2
         	 	
          		if (sum(names(stddev) %in% "SUBJECT_NESTED.(Intercept)") > 0) {
            		VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT_NESTED.(Intercept)"]^2
          		}
          		if (sum(names(stddev) %in% "SUBJECT.(Intercept)") > 0) {
            		VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT.(Intercept)"]^2
          		}
          		if (sum(names(stddev) %in% "SUBJECT:GROUP.(Intercept)") > 0) {
            		VarComponent[i, "GroupBySubject"] <- stddev[names(stddev) == "SUBJECT:GROUP.(Intercept)"]^2
          		}
        	}
      	}  
    } ## end-loop
    
    ## VarComponent[is.na(VarComponent)] <- 0	
    ## for label-free DDA, there are lots of missingness and lots of zero SE. So, remove NA SE.
    median.sigma.error <- median(VarComponent[, "Error"], na.rm=TRUE)
    
   	if (sum(!is.na(VarComponent[, "GroupBySubject"])) > 0) {
    	
    	median.sigma.subject <- median(VarComponent[, "GroupBySubject"], na.rm=TRUE)
      
    } else {
      	if (sum(!is.na(VarComponent[, "Subject"])) > 0) {
        	median.sigma.subject <- median(VarComponent[, "Subject"], na.rm=TRUE)
     	} else {
        	median.sigma.subject <- 0
      	}
    }
    
    ##
    processout <- rbind(processout, "Calculated variance component. - okay")
    write.table(processout, file=finalfile, row.names=FALSE)
        
    ## power calculation
    if (isTRUE(power)) {
      	delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
      	desiredFC <- 2^delta
      	m0_m1 <- 99
     	t <- delta / sqrt(2 * (median.sigma.error / numSample + median.sigma.subject / numSample))
      	#t <- delta/sqrt(2*median.sigma.error/numPep/numTran/numSample+median.sigma.subject/numSample)
     	powerTemp <- seq(0, 1, 0.01)
      
      	power <- NULL
      	for (i in 1:length(t)) {
       	 	diff <- qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t[i]
        	min(abs(diff), na.rm=TRUE)
        	power[i] <- powerTemp[order(abs(diff))][1]
      	}
      
      	CV <- round( (2 * (median.sigma.error / numSample + median.sigma.subject / numSample)) / desiredFC, 3)
     
      	###
     	processout <- rbind(processout, 
     	                    "Power is calculated. - okay")
      	write.table(processout, file=finalfile, row.names=FALSE)
      
      	out <- data.frame(desiredFC, numSample, FDR, power=power, CV)

      	return(out)
    }	
    
	if (is.numeric(power)) {
      
      	## Large portion of proteins are not changing
      	m0_m1 <- 99 ## it means m0/m1=99, m0/(m0+m1)=0.99
      	alpha <- power * FDR / (1 + (1 - FDR) * m0_m1)
      
      	## Num Sample calculation
      	if (isTRUE(numSample)) {
        	delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
        	desiredFC <- 2^delta
        	z_alpha <- qnorm(1-alpha/2)
        	z_beta <- qnorm(power)
        	aa <- (delta/(z_alpha+z_beta))^2
        	numSample <- round(2 * (median.sigma.error + median.sigma.subject) / aa, 0)
        	CV <- round(2 * (median.sigma.error / numSample + median.sigma.subject / numSample) / desiredFC, 3)

	  		##
        	processout <- rbind(processout, 
        	                    "The number of sample is calculated. - okay")
        	write.table(processout, file=finalfile, row.names=FALSE)
        
        	out <- data.frame(desiredFC,numSample,FDR,power,CV)
       	 	return(out)
      	}
	} # when power is numeric
  	#} ## label-free
}


#############################################
## designSampleSizePlots
#############################################
#' @export
designSampleSizePlots <- function(data=data) {
  
  	if (length(unique(data$numSample)) > 1) {
  	    index <- "numSample"
  	}
  	if (length(unique(data$power)) > 1) {
  	    index <- "power"
  	}
  
  	if (length(unique(data$numSample)) == 1 & length(unique(data$power)) == 1) {
  	    index <- "numSample"
  	}
  
  	text.size <- 1.2	
  	axis.size <- 1.3	
  	lab.size <- 1.7
  
  	if (index == "numSample") {
    
        plot(data$desiredFC, data$numSample, 
             lwd=2, xlab="", ylab="", 
             cex.axis=axis.size, type="l", xaxt="n")
  	    axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05), 
  	         labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
  	         cex.axis=axis.size)
  	    #axis(3, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
  	    #     labels=data$CV[which(data$desiredFC %in% seq(min(data$desiredFC), max(data$desiredFC), 0.05))],
  	    #     cex.axis=axis.size)
  	    #mtext("Coefficient of variation, CV", 3, line=2.5, cex=lab.size)
  	    mtext("Desired fold change", 1, line=3.5, cex=lab.size)
  	    mtext("Minimal number of biological replicates", 2, line=2.5, cex=lab.size)
  	    legend("topright",
  	           c(paste("FDR is", unique(data$FDR)),
  	             paste("Statistical power is", unique(data$power))),
  	           bty="n",
  	           cex=text.size)
  	}
  
    if (index == "power") {
    
    	plot(data$desiredFC, data$power, 
    	     lwd=2, xlab="", ylab="", 
    	     cex.axis=axis.size, type="l", xaxt="n")
        axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05), 
             labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05), 
             cex.axis=axis.size)
        mtext("Desired fold change", 1, line=3.5, cex=lab.size)
        mtext("Power", 2, line=2.5, cex=lab.size)
        legend("bottomright", 
               c(paste("Number of replicates is", unique(data$numSample)), 
                 paste("FDR is", unique(data$FDR))), 
               bty="n", 
               cex=text.size)
  	}
}

Any scripts or data that you put into this service are public.

MSstats documentation built on Feb. 28, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/SampleSizeCalculation.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions designSampleSizePlots designSampleSize

Documented in designSampleSize designSampleSizePlots

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/SampleSizeCalculation.R In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions designSampleSizePlots designSampleSize

Documented in designSampleSize designSampleSizePlots

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/SampleSizeCalculation.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments