R/bootstrap.enrichment.test.R

Defines functions bootstrap.enrichment.test

Documented in bootstrap.enrichment.test

#' Bootstrap celltype enrichment test
#'
#' \code{bootstrap.enrichment.test} takes a genelist and a single cell type transcriptome dataset
#' and determines the probability of enrichment and fold changes for each cell type.
#'
#' @param sct_data List generated using \code{\link{read_celltype_data}}
#' @param mouse.hits Array of MGI gene symbols containing the target gene list. Not required if geneSizeControl=TRUE
#' @param mouse.bg Array of MGI gene symbols containing the background gene list. Not required if geneSizeControl=TRUE
#' @param human.hits Array of HGNC gene symbols containing the target gene list. Not required if geneSizeControl=FALSE
#' @param human.bg Array of HGNC gene symbols containing the background gene list. Not required if geneSizeControl=FALSE
#' @param reps Number of random gene lists to generate (default=100 but should be over 10000 for publication quality results)
#' @param sub a logical indicating whether to analyse sub-cell type annotations (TRUE) or cell-type annotations (FALSE). Default is FALSE.
#' @param geneSizeControl a logical indicating whether you want to control for GC content and transcript length. Recommended if the gene list originates from genetic studies. Default is FALSE. If set to TRUE then human gene lists should be used rather than mouse.
#' @return A list containing three data frames:
#' \itemize{
#'   \item \code{results}: dataframe in which each row gives the statistics (p-value, fold change and number of standard deviations from the mean) associated with the enrichment of the stated cell type in the gene list
#'   \item \code{hit.cells}: vector containing the summed proportion of expression in each cell type for the target list
#'   \item \code{bootstrap_data}: matrix in which each row represents the summed proportion of expression in each cell type for one of the random lists
#' }
#' @examples
#' # Load the single cell data
#' data(celltype_data)
#'
#' # Set the parameters for the analysis
#' reps=100 # <- Use 100 bootstrap lists so it runs quickly, for publishable analysis use >10000
#' subCellStatus=0 # <- Use subcell level annotations (i.e. Interneuron type 3)
#' if(subCellStatus==1){subCellStatus=TRUE;cellTag="SubCells"}
#' if(subCellStatus==0){subCellStatus=FALSE;cellTag="FullCells"}
#'
#' # Load the gene list and get human orthologs
#' data("example_genelist")
#' data("mouse_to_human_homologs")
#' m2h = unique(mouse_to_human_homologs[,c("HGNC.symbol","MGI.symbol")])
#' mouse.hits = unique(m2h[m2h$HGNC.symbol %in% example_genelist,"MGI.symbol"])
#' human.hits = unique(m2h[m2h$HGNC.symbol %in% example_genelist,"HGNC.symbol"])
#' human.bg = unique(setdiff(m2h$HGNC.symbol,human.hits))
#' mouse.bg  = unique(setdiff(m2h$MGI.symbol,mouse.hits))
#'
#' # Bootstrap significance testing, without controlling for transcript length and GC content
#' full_results = bootstrap.enrichment.test(sct_data=celltype_data,mouse.hits=mouse.hits,
#'   mouse.bg=mouse.bg,reps=reps,sub=subCellStatus)
#'
#' # Bootstrap significance testing controlling for transcript length and GC content
#' full_results = bootstrap.enrichment.test(sct_data=celltype_data,human.hits=human.hits,
#'   human.bg=human.bg,reps=reps,sub=subCellStatus,geneSizeControl=TRUE)
#' @export
# @import biomaRt
# @importFrom reshape2 melt
# @import plyr
bootstrap.enrichment.test <- function(sct_data=NA,mouse.hits=NA,mouse.bg=NA,human.hits=NA,human.bg=NA,reps = 100,sub=FALSE,geneSizeControl=FALSE){
    # CHECK THE ARGUMENTS ARE PROPERLY STRUCTURED
    if(geneSizeControl==TRUE){
        if(unique(is.na(human.hits))|unique(is.na(human.bg))){
            stop("ERROR: If geneSizeControl=TRUE then human gene lists must be provided in human.hits and human.bg")
        }
    }
    if(geneSizeControl==FALSE){
        if(unique(is.na(mouse.hits))|unique(is.na(mouse.bg))){
            stop("ERROR: If geneSizeControl=FALSE then mouse gene lists must be provided in mouse.hits and mouse.bg")
        }
    }
    if(unique(is.na(sct_data))){stop("ERROR: must provide valid single cell data generated using read_celltype_data() function")}

    # IF USING GENESIZE AND GC-CONTENT MATCHING, THEN GENERATE THE SAMPLE LISTS
	if(geneSizeControl==TRUE){
		#humanGenelist=human.hits
		#human.bg=human.bg
		#mouseGenes=unique(c(mouse.hits,mouse.bg))
		control_related = prepare.genesize.control.network(human.hits,human.bg,unique(c(mouse.hits,mouse.bg)),numBOOT=reps)
		control_network = control_related[["list_network"]]
		hitGenes = control_related[["hitGenes"]]
		if(length(hitGenes)!=dim(control_network)[2]){
			stop("ERROR! AFTER CALCULATING BOOTSTRAPPING NETWORK WITH LENGTH + GC CONTROLS, size of list_network is not same length as hitGenes")
		}
	}else{
	    hitGenes = mouse.hits
	    nonHits  = mouse.bg
	    combinedGenes = c(mouse.hits,mouse.bg)
	}

    # CHECK THAT SUFFICIENT GENES HAVE BEEN PROVIDED IN THE TARGET LIST
    if(length(hitGenes)<=3){
        stop("ERROR: At least four genes are required to test for enrichment")
    }

	# GET WEIGHTING FOR EACH CELL TYPE FOR HIT LIST (hit.cells) AND A MATRIX TO STORE THE BOOTSTRAP WEIGHTINGS (bootstrap_data)
	if(sub==TRUE){
		bootstrap_data = matrix(0,ncol=length(colnames(sct_data$subcell_dists)),nrow=reps)
		cells = unique(colnames(sct_data$subcell_dists))
		hit.cells = subcell.list.dist(hitGenes,sct_data)
	}else{
		bootstrap_data = matrix(0,ncol=length(colnames(sct_data$cell_dists)),nrow=reps)
		cells = unique(colnames(sct_data$cell_dists))
		hit.cells = cell.list.dist(hitGenes,sct_data)
	}
	for(s in 1:reps){
		if(geneSizeControl==FALSE){
			bootstrap_set       = sample(combinedGenes,length(hitGenes))
		}else{
			bootstrap_set       = control_network[s,]
		}
		if(sub==TRUE){
			bootstrap_data[s,] = subcell.list.dist(bootstrap_set,sct_data)	# This is where it crashed before... with s=149
		}else{
			bootstrap_data[s,] = cell.list.dist(bootstrap_set,sct_data)
		}
	}

	# EXTRACTING THE DETAILS:
	# - CALCULATING P-VALUE, FOLD CHANGE AND MARKERS ETC
	count=0
	for(ct in cells){
	    print(ct)
		count=count+1
		# For cell type 'ct' get the bootstrap and target list values
		if(sub==TRUE){ct_boot_dist = bootstrap_data[,colnames(sct_data$subcell_dists)==ct] ; hit_sum = hit.cells[colnames(sct_data$subcell_dists)==ct]  }
		if(sub==FALSE){ct_boot_dist = bootstrap_data[,colnames(sct_data$cell_dists)==ct] ; hit_sum = hit.cells[colnames(sct_data$cell_dists)==ct]  }
		# Get propability and fold change of enrichment
		p=sum(ct_boot_dist>=hit_sum)/reps
		print(p)
		fold_change  = hit_sum/mean(ct_boot_dist)
		sd_from_mean = (hit_sum-mean(ct_boot_dist))/sd(ct_boot_dist)
		if(sub==TRUE){
			ct_root = gsub("_.*","",ct)
		}else{ct_root=ct}
		if(p<0.05){
			# If cell type is significant, print the contributing genes:
			print(sprintf("Fold enrichment: %s",fold_change))
			print(sprintf("Standard deviations from mean: %s", sd_from_mean))
		}
		if(count==1){
			results = data.frame(CellType=ct,subcells=sub,p=p,fold_change=fold_change,sd_from_mean=sd_from_mean)
		}else{
			results = rbind(results,data.frame(CellType=ct,subcells=sub,p=p,fold_change=fold_change,sd_from_mean=sd_from_mean))
		}
		print("")
	}

	full_results = list(results=results,hit.cells=hit.cells,bootstrap_data=bootstrap_data)
	return(full_results)
}

Try the EWCE package in your browser

Any scripts or data that you put into this service are public.

EWCE documentation built on May 31, 2017, 3:16 p.m.