R/SampleSizeCalculation.R

Defines functions designSampleSize designSampleSizePlots

Documented in designSampleSize designSampleSizePlots

#############################################
## designSampleSize
#############################################

#' @export
designSampleSize <- function(data=data,
						desiredFC=desiredFC,
						FDR=0.05,
						numSample=TRUE,
						power=0.9) {
  
  	labeled <- FALSE
  	scopeOfBioReplication <- "expanded"
  	interference <- TRUE
  	equalFeatureVar <- TRUE
  
  	## save process output in each step
  	allfiles <- list.files()
  	filenaming <- "msstats"
  
  	if (length(grep(filenaming, allfiles)) == 0) {
    
    	finalfile <- "msstats.log"
    	processout <- NULL
    
  	} else {
    
    	num <- 0
    	finalfile <- "msstats.log"
    
    	while(is.element(finalfile, allfiles)) {
      		num <- num + 1
      		lastfilename <- finalfile ## in order to rea
      		finalfile <- paste(paste(filenaming, num, sep="-"), ".log", sep="")
    	}
    
    	finalfile <- lastfilename
    	processout <- as.matrix(read.table(finalfile, header=T, sep="\t"))
  	}
  
  	processout <- rbind(processout, as.matrix(c(" ", " ", "MSstats - designSampleSize function", " "), ncol=1))
  
  	processout <- rbind(processout, c(paste("Desired fold change = ", paste(desiredFC, collapse=" - "), sep="")))
  	processout <- rbind(processout, c(paste("FDR = ", FDR, sep="")))
  	processout <- rbind(processout, c(paste("Power = ", power, sep="")))
  
  	write.table(processout, file=finalfile, row.names=FALSE)
  
  	## for label-free experiment
  	#if (!labeled) {
    
    	sigma.error <- NULL	
    	VarComponent <- data.frame(Protein=seq(1, length(data)), Error=NA, Subject=NA, GroupBySubject=NA)
    
    	for (i in 1:length(data)) {
      
      		# note: when run is fixed, we can obtain the same variance of error for both case-control and time course studies.
      
      		fit.full <- data[[i]]
      
      		## if fit.full==NA (class(fit.full)=="try-error)
      		if (is.null(fit.full)) {
        		## !!!!! but if we have NULL for last protein?
        		next
        
      		} else {
        
        		## get variance component
        
        		if (class(fit.full) != "mer") {
          			VarComponent[i, "Error"] <- summary(fit.full)$sigma^2
        		} else {
          			stddev  <-  c(sapply(VarCorr(fit.full), function(el) attr(el, "stddev")),attr(VarCorr(fit.full), "sc"))
         	 		VarComponent[i, "Error"] <- stddev[names(stddev) == ""]^2
         	 	
          			if (sum(names(stddev) %in% "SUBJECT_NESTED.(Intercept)") > 0) {
            			VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT_NESTED.(Intercept)"]^2
          			}
          			if (sum(names(stddev) %in% "SUBJECT.(Intercept)") > 0) {
            			VarComponent[i, "Subject"] <- stddev[names(stddev) == "SUBJECT.(Intercept)"]^2
          			}
          			if (sum(names(stddev) %in% "SUBJECT:GROUP.(Intercept)") > 0) {
            			VarComponent[i, "GroupBySubject"] <- stddev[names(stddev) == "SUBJECT:GROUP.(Intercept)"]^2
          			}
        		}
      		}  
    	} ## end-loop
    
    	#		VarComponent[is.na(VarComponent)] <- 0	
    	## for label-free DDA, there are lots of missingness and lots of zero SE. So, remove NA SE.
    	median.sigma.error <- median(VarComponent[, "Error"], na.rm=TRUE)
    
   	 	if (sum(!is.na(VarComponent[, "GroupBySubject"])) > 0) {
    	
    		median.sigma.subject <- median(VarComponent[, "GroupBySubject"], na.rm=TRUE)
      
    	} else {
      		if (sum(!is.na(VarComponent[, "Subject"])) > 0) {
        		median.sigma.subject <- median(VarComponent[, "Subject"], na.rm=TRUE)
     		} else {
        		median.sigma.subject <- 0
      		}
    	}
    
    	##
    	processout <- rbind(processout, c("Calculated variance component. - okay"))
    	write.table(processout, file=finalfile, row.names=FALSE)
        
    	## power calculation
    	if (isTRUE(power)) {
      		delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
      		desiredFC <- 2^delta
      		m0_m1=99
     		t <- delta/sqrt(2*median.sigma.error/numSample+median.sigma.subject/numSample)
      		#t <- delta/sqrt(2*median.sigma.error/numPep/numTran/numSample+median.sigma.subject/numSample)
     		powerTemp <- seq(0, 1, 0.01)
      
      		power <- NULL
      		for(i in 1:length(t)) {
       	 		diff <- qnorm(powerTemp)+qnorm(1-powerTemp*FDR/(1+(1-FDR)*m0_m1)/2)-t[i]
        		min(abs(diff), na.rm=TRUE)
        		power[i] <- powerTemp[order(abs(diff))][1]
      		}
      
      		CV <- round((2*median.sigma.error/(numSample)+median.sigma.subject/numSample)/desiredFC, 3)
     
      		###
     		processout <- rbind(processout, c("Power is calculated. - okay"))
      		write.table(processout, file=finalfile, row.names=FALSE)
      
      		out <- data.frame(desiredFC, numSample, FDR, power=power, CV)

      		return(out)
    	}	
    
		if (is.numeric(power)) {
      
      		## Large portion of proteins are not changing
      		m0_m1=99 ## it means m0/m1=99, m0/(m0+m1)=0.99
      		alpha <- power*FDR/(1+(1-FDR)*m0_m1)
      
      		## Num Sample calculation
      		if (isTRUE(numSample)) {
        		delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
        		desiredFC <- 2^delta
        		z_alpha <- qnorm(1-alpha/2)
        		z_beta <- qnorm(power)
        		aa <- (delta/(z_alpha+z_beta))^2
        		numSample <- round((2*median.sigma.error+median.sigma.subject)/aa,0)
        		CV <- round((2*median.sigma.error/(numSample)+median.sigma.subject/numSample)/desiredFC,3)

	  			##
        		processout <- rbind(processout, c("The number of sample is calculated. - okay"))
        		write.table(processout, file=finalfile, row.names=FALSE)
        
        		out <- data.frame(desiredFC,numSample,FDR,power,CV)
       	 		return(out)
      		}
      
		} # when power is numeric
  	#} ## label-free
}


#############################################
## designSampleSizePlots
#############################################
#' @export
designSampleSizePlots <- function(data=data) {
  
  	if (length(unique(data$numSample)) > 1) index <- "numSample"
  	if (length(unique(data$power)) > 1) index <- "power"
  
  	if (length(unique(data$numSample)) == 1 & length(unique(data$power)) == 1) index <- "numSample"
  
  	text.size <- 1.2	
  	axis.size <- 1.3	
  	lab.size <- 1.7
  
  	if (index == "numSample") {
    
    	with(data, {
      		plot(desiredFC, numSample, lwd=2, xlab="", ylab="", cex.axis=axis.size, type="l", xaxt="n")})
    		axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05), labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05), cex.axis=axis.size)
    		axis(3, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05), labels=data$CV[which(data$desiredFC %in% seq(min(data$desiredFC), max(data$desiredFC), 0.05))], cex.axis=axis.size)
    		mtext("Coefficient of variation, CV", 3, line=2.5, cex=lab.size)
    		mtext("Desired fold change", 1, line=3.5, cex=lab.size)
    		mtext("Minimal number of biological replicates", 2, line=2.5, cex=lab.size)
    		legend("topright", c(paste("FDR is", unique(data$FDR), sep=" "), paste("Statistical power is", unique(data$power), sep=" ")), bty="n", cex=text.size)
  	}
  
   if (index == "power") {
    
    	with(data, {
      		plot(desiredFC, power, lwd=2, xlab="", ylab="", cex.axis=axis.size, type="l", xaxt="n")})
    		axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05), labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05), cex.axis=axis.size)
    		mtext("Desired fold change", 1, line=3.5, cex=lab.size)
    		mtext("Power", 2, line=2.5, cex=lab.size)
    		legend("bottomright", c(paste("Number of replicates is", unique(data$numSample), sep=" "), paste("FDR is", unique(data$FDR), sep=" ")), bty="n", cex=text.size)
  	}
}
lindsaypino/MSstats-patch documentation built on May 24, 2019, 6 p.m.