R/HostGeneCount.R

#' @title HostGeneCount
#' 
#' @description
#' Count circRNA host gene expression from given BED6 format CircCoordinates.
#' 
#' @param CircCoordinates CircCoordinates output of DCC, or any other BED6 format files, input as data.frame
#' @param bamfiles A iterable list of bamfiles with full path.
#' @param ncore Number of core to use.
#' @param stranded Whether the library is prepared strand specific, default "unstranded". Accepted values are "unstranded","fs","ss", meaning "unstranded library", "first strand library" and "second strand library" respectively.
#' @param splicedOnly Only count reads that are spliced at the circRNA junction, default TRUE.
#' @export HostGeneCount
#' 

HostGeneCount <- function(CircCoordinates,bamfiles,ncore=2,stranded='unstranded',splicedOnly=TRUE){
	require(Rsamtools)
	require(BiocParallel)
	register(MulticoreParam(ncore),default=FALSE)

	if(stranded=='fs'){
		strand_info <- CircCoordinates[,6]
		message('First strand library.')
	}else{
		if(stranded=='ss'){
			strand_info <- CircCoordinates[,6]
			strand_info <- ifelse(strand_info=='+','-',strand_info)
			message('Second strand library.')
		}else{
			strand_info <- '*'
			message('Unstranded library.')
		}
	}


	CircCoordinates <- data.frame(CircCoordinates)

	if(is.character(bamfiles)) bamfiles = c(bamfiles)
	if(any(!sapply(bamfiles, file.exists))){
		stop('One or more bamfile do not exist!')
	}
	if(any(!sapply(paste0(bamfiles,'.bai'), file.exists))){
		stop('One or more bamfile index do not exist!')
	}

	# setup the criteria/filtering for host gene reads counting 
	flag <- scanBamFlag(isNotPassingQualityControls=FALSE, isDuplicate=FALSE,isSecondaryAlignment=FALSE,isUnmappedQuery=FALSE)
	left <- CircCoordinates[,c(1,2,2,4,5,6)]
	colnames(left) <- c('chr','start','end','Gene','strand')
	which_left <- as(left,'GRanges')
	right <- CircCoordinates[,c(1,3,3,4,5,6)]
	colnames(right) <- c('chr','start','end','Gene','strand')
	which_right <- as(right,'GRanges')

	## Do reads counting
	# Count junction
	left_count <- bplapply(bamfiles,function(bamfile) scanBam(bamfile,param=ScanBamParam(flag=flag,which=which_left,what=c('strand','cigar'))))
	right_count <- bplapply(bamfiles,function(bamfile) scanBam(bamfile,param=ScanBamParam(flag=flag,which=which_right,what=c('strand','cigar'))))
	left_counts = bplapply(left_count, function(one_left_count){
				one_left_count <- read_scanBam(one_left_count,strand=strand_info)

			})
	left_counts <- do.call(cbind,left_counts)

	right_counts = bplapply(right_count, function(one_right_count){
				one_right_count <- read_scanBam(one_right_count,strand=strand_info)
			})
	right_counts <- do.call(cbind,right_counts)

	counts <- ceiling((left_counts+right_counts)/2)
	colnames(counts) <- sapply(basename(bamfiles),function(x) strsplit(x,'\\.')[[1]][1])
	counts <- cbind(CircCoordinates,counts)
	return(counts)
}

#' Read scanBam ouput list object.
#' @param scanBam_result the result list object generated by Rsamtools::scanBam function
#' @param strand a vector of strand information, need to be the same length as the "which" input option of sanBam (the number of circRNA subjected to host gene counting).
#'        CircTest needs this information to determine which strand of the reads should be counted.
#' @param splicedOnly Only count reads that are spliced at the circRNA junction. Default to be TRUE.
#' 
read_scanBam <- function(scanBam_result,strand=strands,splicedOnly=TRUE){
	scanBam_result <- mapply(function(x,y) c(x,gene_strand=y),scanBam_result,as.list(as.character(strand)),SIMPLIFY=F)
	counts <- lapply(
			scanBam_result,function(x) {
				      read_one_scanBam(x,strand=x$gene_strand,splicedOnly=splicedOnly)
                      })
	counts <- unlist(counts)
	return(counts)
}


##' Read one element of the result of sanBam function from Rsamtools. This function Called by read_scanBam.
#' @param scanBam_one_result one element of the sanBam result, corresponding to one genomic region.
#' @param strand of the region, "+", "-" or "*".
#' @param splicedOnly Only count reads that are spliced at the circRNA junction. Default to be TRUE.
#' 
read_one_scanBam <- function(scanBam_one_result,strand='+',splicedOnly=TRUE){
	# Only count spliced reads
	spliced <- grepl('N',scanBam_one_result$cigar)
	if(strand=='*'){
		strandMatched <- TRUE
	}else{
		strandMatched <- scanBam_one_result$strand==strand
	}
	count <- sum(spliced & strandMatched)
	return(count)
}
dieterich-lab/CircTest documentation built on May 15, 2019, 8:29 a.m.